diff --git a/CMakeLists.txt b/CMakeLists.txt index db10fbc6..2e4567d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -373,6 +373,7 @@ list( src/dwarfs/fstypes.cpp src/dwarfs/fs_section.cpp src/dwarfs/global_entry_data.cpp + src/dwarfs/inode_fragments.cpp src/dwarfs/inode_manager.cpp src/dwarfs/inode_reader_v2.cpp src/dwarfs/logger.cpp diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index a8ce4a7c..e3440558 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -23,14 +23,14 @@ #include #include +#include #include #include #include -#include #include #include -#include "dwarfs/file_category.h" +#include "dwarfs/inode_fragments.h" namespace boost::program_options { class options_description; @@ -41,6 +41,9 @@ namespace dwarfs { class logger; +using category_mapper = + std::function; + class categorizer { public: virtual ~categorizer() = default; @@ -50,23 +53,26 @@ class categorizer { class random_access_categorizer : public categorizer { public: - virtual std::optional - categorize(std::filesystem::path const& path, - std::span data) const = 0; + virtual inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const = 0; }; +// TODO: add call to check if categorizer can return multiple fragments +// if it *can* we must run it before we start similarity hashing class sequential_categorizer_job { public: virtual ~sequential_categorizer_job() = default; virtual void add(std::span data) = 0; - virtual std::optional result() = 0; + virtual inode_fragments result() = 0; }; class sequential_categorizer : public categorizer { public: virtual std::unique_ptr - job(std::filesystem::path const& path, size_t total_size) const = 0; + job(std::filesystem::path const& path, size_t total_size, + category_mapper const& mapper) const = 0; }; class categorizer_job { @@ -76,6 +82,10 @@ class categorizer_job { categorizer_job(); categorizer_job(std::unique_ptr impl); + void 
set_total_size(size_t total_size) { + return impl_->set_total_size(total_size); + } + void categorize_random_access(std::span data) { return impl_->categorize_random_access(data); } @@ -84,7 +94,7 @@ class categorizer_job { return impl_->categorize_sequential(data); } - file_category result() { return impl_->result(); } + inode_fragments result() { return impl_->result(); } explicit operator bool() const { return impl_ != nullptr; } @@ -92,9 +102,10 @@ class categorizer_job { public: virtual ~impl() = default; + virtual void set_total_size(size_t total_size) = 0; virtual void categorize_random_access(std::span data) = 0; virtual void categorize_sequential(std::span data) = 0; - virtual file_category result() = 0; + virtual inode_fragments result() = 0; }; private: @@ -111,7 +122,7 @@ class categorizer_manager { return impl_->job(path); } - std::string_view category_name(file_category c) const { + std::string_view category_name(fragment_category::value_type c) const { return impl_->category_name(c); } @@ -121,7 +132,8 @@ class categorizer_manager { virtual void add(std::shared_ptr c) = 0; virtual categorizer_job job(std::filesystem::path const& path) const = 0; - virtual std::string_view category_name(file_category c) const = 0; + virtual std::string_view + category_name(fragment_category::value_type c) const = 0; }; private: diff --git a/include/dwarfs/file_category.h b/include/dwarfs/fragment_category.h similarity index 56% rename from include/dwarfs/file_category.h rename to include/dwarfs/fragment_category.h index 6b56aa2e..4bc307ee 100644 --- a/include/dwarfs/file_category.h +++ b/include/dwarfs/fragment_category.h @@ -21,12 +21,13 @@ #pragma once +#include #include #include namespace dwarfs { -class file_category { +class fragment_category { public: using value_type = uint32_t; @@ -36,37 +37,60 @@ class file_category { static constexpr value_type const max{std::numeric_limits::max() - 1}; - file_category() - : value_{uninitialized} {} - 
file_category(value_type v) + fragment_category() = default; + + explicit fragment_category(value_type v) : value_{v} {} - file_category(file_category const&) = default; - file_category(file_category&&) = default; + fragment_category(value_type v, value_type subcategory) + : value_{v} + , subcategory_{subcategory} {} - file_category& operator=(file_category const&) = default; - file_category& operator=(file_category&&) = default; + fragment_category(fragment_category const&) = default; + fragment_category(fragment_category&&) = default; - file_category& operator=(value_type v) { + fragment_category& operator=(fragment_category const&) = default; + fragment_category& operator=(fragment_category&&) = default; + + fragment_category& operator=(value_type v) { + assert(v != uninitialized); value_ = v; return *this; } value_type value() const { - if (empty()) { - throw std::range_error("file_category is uninitialized"); - } + assert(!empty()); return value_; } - void clear() { value_ = uninitialized; } + void clear() { + value_ = uninitialized; + subcategory_ = uninitialized; + } bool empty() const { return value_ == uninitialized; } explicit operator bool() const { return !empty(); } + void set_subcategory(value_type subcategory) { + assert(!empty()); + assert(subcategory != uninitialized); + subcategory_ = subcategory; + } + + bool has_subcategory() const { + return !empty() && subcategory_ != uninitialized; + } + + value_type subcategory() const { + assert(!empty()); + assert(subcategory_ != uninitialized); + return subcategory_; + } + private: - value_type value_; + value_type value_{uninitialized}; + value_type subcategory_{uninitialized}; }; } // namespace dwarfs diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index a8d086a5..703e113c 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -27,7 +27,7 @@ #include -#include "dwarfs/file_category.h" +#include "dwarfs/inode_fragments.h" #include "dwarfs/nilsimsa.h" #include "dwarfs/object.h" 
@@ -59,7 +59,7 @@ class inode : public object { virtual void add_chunk(size_t block, size_t offset, size_t size) = 0; virtual void append_chunks_to(std::vector& vec) const = 0; - virtual file_category category() const = 0; + virtual inode_fragments const& fragments() const = 0; }; } // namespace dwarfs diff --git a/include/dwarfs/inode_fragments.h b/include/dwarfs/inode_fragments.h new file mode 100644 index 00000000..31913227 --- /dev/null +++ b/include/dwarfs/inode_fragments.h @@ -0,0 +1,83 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dwarfs/fragment_category.h" +#include "dwarfs/types.h" + +namespace dwarfs { + +class single_inode_fragment { + public: + single_inode_fragment(fragment_category category, file_off_t length) + : category_{category} + , length_{length} {} + + fragment_category category() const { return category_; } + file_off_t length() const { return length_; } + file_off_t size() const { return length_; } + + private: + fragment_category category_; + file_off_t length_; +}; + +class inode_fragments { + public: + using mapper_function_type = + std::function; + + inode_fragments() = default; + + single_inode_fragment& + emplace_back(fragment_category category, file_off_t length) { + return fragments_.emplace_back(category, length); + } + + std::span span() const { return fragments_; } + + bool empty() const { return fragments_.empty(); } + + explicit operator bool() const { return !empty(); } + + std::ostream& + to_stream(std::ostream& os, + mapper_function_type const& mapper = mapper_function_type()) const; + std::string + to_string(mapper_function_type const& mapper = mapper_function_type()) const; + + private: + folly::small_vector fragments_; +}; + +inline std::ostream& operator<<(std::ostream& os, inode_fragments const& frag) { + return frag.to_stream(os); +} + +} // namespace dwarfs diff --git a/include/dwarfs/inode_manager.h b/include/dwarfs/inode_manager.h index 71f64cc9..1678fb53 100644 --- a/include/dwarfs/inode_manager.h +++ b/include/dwarfs/inode_manager.h @@ -28,7 +28,7 @@ #include #include -#include "dwarfs/file_category.h" +#include "dwarfs/fragment_category.h" namespace dwarfs { @@ -59,7 +59,8 @@ class inode_manager { impl_->for_each_inode_in_order(fn); } - std::vector> category_counts() const { + std::vector> + category_counts() const { return impl_->category_counts(); } @@ -74,7 +75,7 @@ class inode_manager { file_order_options const& file_order, order_cb const& fn) = 0; virtual 
void for_each_inode_in_order( std::function const&)> const& fn) const = 0; - virtual std::vector> + virtual std::vector> category_counts() const = 0; }; diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index 969bbeb0..067100c0 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -35,19 +35,20 @@ namespace dwarfs { +using namespace std::placeholders; + namespace po = boost::program_options; namespace { constexpr std::string_view const DEFAULT_CATEGORY{""}; } -class categorizer_manager_private { +class categorizer_manager_private : public categorizer_manager::impl { public: - virtual ~categorizer_manager_private() = default; - virtual std::vector> const& categorizers() const = 0; - virtual file_category category(std::string_view cat) const = 0; + virtual fragment_category::value_type + category(std::string_view cat) const = 0; }; template @@ -57,39 +58,48 @@ class categorizer_job_ final : public categorizer_job::impl { std::filesystem::path const& path) : LOG_PROXY_INIT(lgr) , mgr_{mgr} - , path_{path} {} + , path_{path} + , cat_mapper_{std::bind(&categorizer_manager_private::category, + std::cref(mgr_), _1)} {} + void set_total_size(size_t total_size) override; void categorize_random_access(std::span data) override; void categorize_sequential(std::span data) override; - file_category result() override; + inode_fragments result() override; private: LOG_PROXY_DECL(LoggerPolicy); categorizer_manager_private const& mgr_; - std::string_view best_{DEFAULT_CATEGORY}; + inode_fragments best_; int index_{-1}; bool is_global_best_{false}; - size_t total_size_hint_{0}; + size_t total_size_{0}; std::vector>> seq_jobs_; std::filesystem::path const path_; + category_mapper cat_mapper_; }; +template +void categorizer_job_::set_total_size(size_t total_size) { + total_size_ = total_size; +} + template void categorizer_job_::categorize_random_access( std::span data) { DWARFS_CHECK(index_ < 0, "internal error: index already set in 
categorize_random_access"); - total_size_hint_ = data.size(); + total_size_ = data.size(); bool global_best = true; for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { if (auto p = dynamic_cast(cat.get())) { - if (auto c = p->categorize(path_, data)) { - best_ = *c; + if (auto c = p->categorize(path_, data, cat_mapper_)) { + best_ = c; index_ = index; is_global_best_ = global_best; break; @@ -114,7 +124,7 @@ void categorizer_job_::categorize_sequential( } if (auto p = dynamic_cast(cat.get())) { - if (auto job = p->job(path_, total_size_hint_)) { + if (auto job = p->job(path_, total_size_, cat_mapper_)) { seq_jobs_.emplace_back(index, std::move(job)); } } @@ -127,12 +137,12 @@ void categorizer_job_::categorize_sequential( } template -file_category categorizer_job_::result() { +inode_fragments categorizer_job_::result() { if (!seq_jobs_.empty()) { for (auto&& [index, job] : seq_jobs_) { if (auto c = job->result()) { assert(index_ < 0 || index < index_); - best_ = *c; + best_ = c; break; } } @@ -140,9 +150,12 @@ file_category categorizer_job_::result() { seq_jobs_.clear(); } - LOG_TRACE << path_ << " -> " << best_; + LOG_TRACE << path_ << " -> " + << best_.to_string([this](fragment_category::value_type c) { + return std::string(mgr_.category_name(c)); + }); - return mgr_.category(best_); + return best_; } categorizer_job::categorizer_job() = default; @@ -151,8 +164,7 @@ categorizer_job::categorizer_job(std::unique_ptr impl) : impl_{std::move(impl)} {} template -class categorizer_manager_ final : public categorizer_manager::impl, - public categorizer_manager_private { +class categorizer_manager_ final : public categorizer_manager_private { public: categorizer_manager_(logger& lgr) : lgr_{lgr} @@ -162,14 +174,15 @@ class categorizer_manager_ final : public categorizer_manager::impl, void add(std::shared_ptr c) override; categorizer_job job(std::filesystem::path const& path) const override; - std::string_view category_name(file_category c) const override; 
+ std::string_view + category_name(fragment_category::value_type c) const override; std::vector> const& categorizers() const override { return categorizers_; } - file_category category(std::string_view cat) const override { + fragment_category::value_type category(std::string_view cat) const override { auto it = catmap_.find(cat); DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat)); return it->second; @@ -188,7 +201,7 @@ class categorizer_manager_ final : public categorizer_manager::impl, LOG_PROXY_DECL(LoggerPolicy); std::vector> categorizers_; std::vector categories_; - std::unordered_map catmap_; + std::unordered_map catmap_; }; template @@ -210,9 +223,9 @@ categorizer_job categorizer_manager_::job( } template -std::string_view -categorizer_manager_::category_name(file_category c) const { - return DWARFS_NOTHROW(categories_.at(c.value())); +std::string_view categorizer_manager_::category_name( + fragment_category::value_type c) const { + return DWARFS_NOTHROW(categories_.at(c)); } categorizer_manager::categorizer_manager(logger& lgr) diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp index c7c2dc58..075cb85a 100644 --- a/src/dwarfs/categorizer/binary_categorizer.cpp +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -58,9 +58,9 @@ class binary_categorizer_ final : public binary_categorizer_base { binary_categorizer_(logger& lgr) : LOG_PROXY_INIT(lgr) {} - std::optional - categorize(std::filesystem::path const& path, - std::span data) const override; + inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const override; private: LOG_PROXY_DECL(LoggerPolicy); @@ -74,10 +74,12 @@ std::span binary_categorizer_base::categories() const { } template -std::optional -binary_categorizer_::categorize(std::filesystem::path const&, - std::span data - [[maybe_unused]]) const { +inode_fragments binary_categorizer_::categorize( + 
std::filesystem::path const&, + std::span data [[maybe_unused]], + category_mapper const& /*mapper*/) const { + inode_fragments fragments; + #ifndef _WIN32 auto p = data.data(); if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) { @@ -101,7 +103,7 @@ binary_categorizer_::categorize(std::filesystem::path const&, } #endif - return std::nullopt; + return fragments; } class binary_categorizer_factory : public categorizer_factory { diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp index 665b4cf6..976b2b05 100644 --- a/src/dwarfs/categorizer/incompressible_categorizer.cpp +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -57,10 +57,12 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { incompressible_categorizer_job_(logger& lgr, incompressible_categorizer_config const& cfg, std::filesystem::path const& path, - size_t total_size) + size_t total_size, + category_mapper const& mapper) : LOG_PROXY_INIT(lgr) , cfg_{cfg} - , path_{path} { + , path_{path} + , mapper_{mapper} { input_.reserve(total_size < block_size ? 
total_size : block_size); state_ = ::malloc(LZ4_sizeofState()); } @@ -77,7 +79,8 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { } } - std::optional result() override { + inode_fragments result() override { + inode_fragments fragments; if (!input_.empty()) { compress(); } @@ -88,9 +91,11 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { if (total_blocks_ > 0 && (total_output_size_ >= cfg_.max_ratio_size * total_input_size_ || incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) { - return INCOMPRESSIBLE_CATEGORY; + fragments.emplace_back( + fragment_category(mapper_(INCOMPRESSIBLE_CATEGORY)), + total_input_size_); } - return std::nullopt; + return fragments; } private: @@ -139,6 +144,7 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { size_t incompressible_blocks_{0}; incompressible_categorizer_config const& cfg_; std::filesystem::path const& path_; + category_mapper const& mapper_; }; class incompressible_categorizer_ final : public sequential_categorizer { @@ -148,7 +154,8 @@ class incompressible_categorizer_ final : public sequential_categorizer { std::span categories() const override; std::unique_ptr - job(std::filesystem::path const& path, size_t total_size) const override; + job(std::filesystem::path const& path, size_t total_size, + category_mapper const& mapper) const override; private: logger& lgr_; @@ -170,7 +177,8 @@ incompressible_categorizer_::categories() const { std::unique_ptr incompressible_categorizer_::job(std::filesystem::path const& path, - size_t total_size) const { + size_t total_size, + category_mapper const& mapper) const { if (total_size < config_.min_input_size) { return nullptr; } @@ -178,7 +186,7 @@ incompressible_categorizer_::job(std::filesystem::path const& path, return make_unique_logging_object(lgr_, config_, path, - total_size); + total_size, mapper); } class incompressible_categorizer_factory : public categorizer_factory { diff 
--git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp index c26b961c..47324906 100644 --- a/src/dwarfs/categorizer/libmagic_categorizer.cpp +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -143,9 +143,9 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base { } } - std::optional - categorize(std::filesystem::path const& path, - std::span data) const override; + inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const override; private: LOG_PROXY_DECL(LoggerPolicy); @@ -162,15 +162,17 @@ libmagic_categorizer_base::categories() const { } template -std::optional libmagic_categorizer_::categorize( - std::filesystem::path const& path, std::span data) const { +inode_fragments libmagic_categorizer_::categorize( + std::filesystem::path const& path, std::span data, + category_mapper const& /*mapper*/) const { + inode_fragments fragments; // TODO: actually fill this :-) auto id = m_.identify(data); LOG_DEBUG << path << " -> (magic) " << id; { auto wlock = mimetypes_.wlock(); ++(*wlock)[id]; } - return std::nullopt; + return fragments; } class libmagic_categorizer_factory : public categorizer_factory { diff --git a/src/dwarfs/inode_fragments.cpp b/src/dwarfs/inode_fragments.cpp new file mode 100644 index 00000000..a09721a6 --- /dev/null +++ b/src/dwarfs/inode_fragments.cpp @@ -0,0 +1,74 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see <https://www.gnu.org/licenses/>. + */ + +#include +#include + +#include "dwarfs/inode_fragments.h" + +namespace dwarfs { + +std::ostream& +inode_fragments::to_stream(std::ostream& os, + mapper_function_type const& mapper) const { + if (empty()) { + os << "(empty)"; + } else { + os << "["; + bool first = true; + + for (auto const& f : span()) { + if (first) { + first = false; + } else { + os << ", "; + } + + os << "("; + + auto const& cat = f.category(); + if (mapper) { + os << mapper(cat.value()); + } else { + os << cat.value(); + } + + if (cat.has_subcategory()) { + os << "/" << cat.subcategory(); + } + + os << ", " << f.size() << ")"; + } + + os << "]"; + } + + return os; +} + +std::string +inode_fragments::to_string(mapper_function_type const& mapper) const { + std::ostringstream oss; + to_stream(oss, mapper); + return oss.str(); +} + +} // namespace dwarfs diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 3d05bce0..44eb7e2d 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -155,6 +155,7 @@ class inode_ : public inode { if (mm) { if (catjob) { + catjob.set_total_size(mm->size()); catjob.categorize_random_access(mm->span()); } @@ -201,7 +202,7 @@ class inode_ : public inode { } if (catjob) { - category_ = catjob.result(); + fragments_ = catjob.result(); } } @@ -228,12 +229,14 @@ class inode_ : public inode { vec.insert(vec.end(), chunks_.begin(), chunks_.end()); } - file_category category() const override { return category_; } + inode_fragments const& fragments() const override { return fragments_; } private: + // TODO: can we move optional stuff (e.g.
nilsimsa_similarity_hash_) out of + // here? std::optional num_; uint32_t similarity_hash_{0}; - file_category category_; + inode_fragments fragments_; files_vector files_; std::vector chunks_; nilsimsa::hash_type nilsimsa_similarity_hash_; @@ -278,23 +281,26 @@ class inode_manager_ final : public inode_manager::impl { } } - std::vector> + std::vector> category_counts() const override { - std::unordered_map tmp; + std::unordered_map tmp; for (auto const& i : inodes_) { - ++tmp[i->category().value()]; + if (auto const& fragments = i->fragments(); !fragments.empty()) { + for (auto const& frag : fragments.span()) { + ++tmp[frag.category().value()]; + } + } } - std::vector> rv; + std::vector> rv; for (auto const& [k, v] : tmp) { rv.emplace_back(k, v); } - std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) { - return a.first.value() < b.first.value(); - }); + std::sort(rv.begin(), rv.end(), + [](auto const& a, auto const& b) { return a.first < b.first; }); return rv; }