From 1ac36bb6fa9bdbd86f0d910b9f62d8ff2d1d288d Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Mon, 17 Jul 2023 11:23:10 +0200 Subject: [PATCH] Initial categorizer implementation --- CMakeLists.txt | 29 +- include/dwarfs/categorizer.h | 197 +++++++++++++ include/dwarfs/file_category.h | 72 +++++ include/dwarfs/inode.h | 2 + include/dwarfs/inode_manager.h | 11 + include/dwarfs/options.h | 9 +- src/dwarfs/categorizer.cpp | 268 ++++++++++++++++++ src/dwarfs/categorizer/binary_categorizer.cpp | 129 +++++++++ .../incompressible_categorizer.cpp | 232 +++++++++++++++ .../categorizer/libmagic_categorizer.cpp | 198 +++++++++++++ src/dwarfs/inode_manager.cpp | 51 +++- src/dwarfs/scanner.cpp | 9 + src/mkdwarfs_main.cpp | 19 +- vcpkg.json | 1 + 14 files changed, 1219 insertions(+), 8 deletions(-) create mode 100644 include/dwarfs/categorizer.h create mode 100644 include/dwarfs/file_category.h create mode 100644 src/dwarfs/categorizer.cpp create mode 100644 src/dwarfs/categorizer/binary_categorizer.cpp create mode 100644 src/dwarfs/categorizer/incompressible_categorizer.cpp create mode 100644 src/dwarfs/categorizer/libmagic_categorizer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index dfab4426..db10fbc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,6 +212,7 @@ if(PKG_CONFIG_FOUND) pkg_check_modules(LIBBROTLIDEC IMPORTED_TARGET libbrotlidec>=1.0.9) pkg_check_modules(LIBBROTLIENC IMPORTED_TARGET libbrotlienc>=1.0.9) pkg_check_modules(LIBARCHIVE IMPORTED_TARGET libarchive>=3.6.0) + pkg_check_modules(LIBMAGIC IMPORTED_TARGET libmagic>=5.38) pkg_check_modules(ZSTD IMPORTED_TARGET libzstd>=1.5.2) pkg_check_modules(XXHASH IMPORTED_TARGET libxxhash>=0.8.1) endif() @@ -356,6 +357,7 @@ list( src/dwarfs/block_range.cpp src/dwarfs/builtin_script.cpp src/dwarfs/cached_block.cpp + src/dwarfs/categorizer.cpp src/dwarfs/checksum.cpp src/dwarfs/chmod_transformer.cpp src/dwarfs/console_writer.cpp @@ -413,8 +415,20 @@ if(LIBBROTLIDEC_FOUND AND LIBBROTLIENC_FOUND) 
list(APPEND LIBDWARFS_COMPRESSION_SRC src/dwarfs/compression/brotli.cpp) endif() +list( + APPEND + LIBDWARFS_CATEGORIZER_SRC + src/dwarfs/categorizer/binary_categorizer.cpp + src/dwarfs/categorizer/incompressible_categorizer.cpp +) + +if(LIBMAGIC_FOUND) + list(APPEND LIBDWARFS_CATEGORIZER_SRC src/dwarfs/categorizer/libmagic_categorizer.cpp) +endif() + add_library(dwarfs ${LIBDWARFS_SRC}) add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC}) +add_library(dwarfs_categorizer ${LIBDWARFS_CATEGORIZER_SRC}) add_library(dwarfs_tool src/dwarfs/tool.cpp) if(DWARFS_GIT_BUILD) @@ -425,6 +439,7 @@ target_compile_definitions( dwarfs_tool PRIVATE PRJ_BUILD_ID="${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_SYSTEM}, ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}" ) +target_link_libraries(dwarfs_categorizer folly) target_link_libraries(dwarfs_compression folly) target_link_libraries(dwarfs_tool dwarfs) @@ -719,7 +734,8 @@ target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS}) target_link_libraries(metadata_thrift thrift_light) -foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS}) +foreach(tgt dwarfs dwarfs_compression dwarfs_categorizer + dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS}) target_include_directories( ${tgt} SYSTEM PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS} @@ -732,6 +748,7 @@ foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGE PRIVATE DWARFS_HAVE_LIBZSTD DWARFS_STATIC_BUILD=${STATIC_BUILD_DO_NOT_USE} $<$:DWARFS_USE_JEMALLOC> + $<$:DWARFS_HAVE_LIBMAGIC> $<$:DWARFS_HAVE_LIBLZ4> $<$:DWARFS_HAVE_LIBLZMA> $<$,$>:DWARFS_HAVE_LIBBROTLI> @@ -809,6 +826,10 @@ target_link_libraries( fsst ${Boost_LIBRARIES}) +if(LIBMAGIC_FOUND) + target_link_libraries(dwarfs PkgConfig::LIBMAGIC) +endif() + if(LIBLZ4_FOUND) target_link_libraries(dwarfs PkgConfig::LIBLZ4) endif() @@ -823,6 +844,7 @@ endif() if(NOT STATIC_BUILD_DO_NOT_USE) target_link_libraries(dwarfs PkgConfig::LIBARCHIVE) + 
target_link_libraries(dwarfs_categorizer PkgConfig::LIBMAGIC) endif(NOT STATIC_BUILD_DO_NOT_USE) if(ZSTD_FOUND AND PREFER_SYSTEM_ZSTD) @@ -850,6 +872,7 @@ foreach(tgt ${BINARY_TARGETS} ${MAIN_TARGETS}) endif() endforeach() +target_link_libraries(mkdwarfs_main "$") if(STATIC_BUILD_DO_NOT_USE) # ................................................................... @@ -883,6 +906,7 @@ if(STATIC_BUILD_DO_NOT_USE) import_static_lib(static_libssl "libssl.a") import_static_lib(static_libunwind "libunwind.a") import_static_lib(static_libarchive "libarchive.a") + import_static_lib(static_libmagic "libmagic.a") set_target_properties(static_libunwind PROPERTIES INTERFACE_LINK_LIBRARIES PkgConfig::LIBLZMA) @@ -890,7 +914,10 @@ if(STATIC_BUILD_DO_NOT_USE) static_libgflags) set_target_properties(static_librt PROPERTIES INTERFACE_LINK_LIBRARIES static_libgflags) + set_target_properties(static_libmagic PROPERTIES INTERFACE_LINK_LIBRARIES + static_libz) + target_link_libraries(dwarfs_categorizer static_libmagic) foreach(tgt ${BINARY_TARGETS}) if(PREFER_SYSTEM_LIBFMT) diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h new file mode 100644 index 00000000..a8ce4a7c --- /dev/null +++ b/include/dwarfs/categorizer.h @@ -0,0 +1,197 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dwarfs/file_category.h" + +namespace boost::program_options { +class options_description; +class variables_map; +} // namespace boost::program_options + +namespace dwarfs { + +class logger; + +class categorizer { + public: + virtual ~categorizer() = default; + + virtual std::span categories() const = 0; +}; + +class random_access_categorizer : public categorizer { + public: + virtual std::optional + categorize(std::filesystem::path const& path, + std::span data) const = 0; +}; + +class sequential_categorizer_job { + public: + virtual ~sequential_categorizer_job() = default; + + virtual void add(std::span data) = 0; + virtual std::optional result() = 0; +}; + +class sequential_categorizer : public categorizer { + public: + virtual std::unique_ptr + job(std::filesystem::path const& path, size_t total_size) const = 0; +}; + +class categorizer_job { + public: + class impl; + + categorizer_job(); + categorizer_job(std::unique_ptr impl); + + void categorize_random_access(std::span data) { + return impl_->categorize_random_access(data); + } + + void categorize_sequential(std::span data) { + return impl_->categorize_sequential(data); + } + + file_category result() { return impl_->result(); } + + explicit operator bool() const { return impl_ != nullptr; } + + class impl { + public: + virtual ~impl() = default; + + virtual void categorize_random_access(std::span data) = 0; + virtual void categorize_sequential(std::span data) = 0; + virtual file_category result() = 0; + }; + + private: + std::unique_ptr impl_; +}; + +class categorizer_manager { + public: + categorizer_manager(logger& lgr); + + void add(std::shared_ptr c) { impl_->add(std::move(c)); } + + categorizer_job job(std::filesystem::path const& path) const { + return impl_->job(path); + } + + 
std::string_view category_name(file_category c) const { + return impl_->category_name(c); + } + + class impl { + public: + virtual ~impl() = default; + + virtual void add(std::shared_ptr c) = 0; + virtual categorizer_job job(std::filesystem::path const& path) const = 0; + virtual std::string_view category_name(file_category c) const = 0; + }; + + private: + std::unique_ptr impl_; +}; + +class categorizer_info { + public: + virtual ~categorizer_info() = default; + + virtual std::string_view name() const = 0; + virtual std::shared_ptr + options() const = 0; +}; + +class categorizer_factory : public categorizer_info { + public: + virtual std::unique_ptr + create(logger& lgr, + boost::program_options::variables_map const& vm) const = 0; +}; + +namespace detail { + +template +class categorizer_factory_registrar { + public: + categorizer_factory_registrar(); +}; + +} // namespace detail + +class categorizer_registry { + public: + template + friend class detail::categorizer_factory_registrar; + + static categorizer_registry& instance(); + + std::unique_ptr + create(logger& lgr, std::string const& name, + boost::program_options::variables_map const& vm) const; + + void add_options(boost::program_options::options_description& opts) const; + + std::vector categorizer_names() const; + + private: + categorizer_registry(); + ~categorizer_registry(); + + void register_factory(std::unique_ptr&& factory); + + std::map> factories_; +}; + +namespace detail { + +template +categorizer_factory_registrar::categorizer_factory_registrar() { + ::dwarfs::categorizer_registry::instance().register_factory( + std::make_unique()); +} + +} // namespace detail + +#define REGISTER_CATEGORIZER_FACTORY(factory) \ + namespace { \ + ::dwarfs::detail::categorizer_factory_registrar \ + the_##factory##_registrar; \ + } + +} // namespace dwarfs diff --git a/include/dwarfs/file_category.h b/include/dwarfs/file_category.h new file mode 100644 index 00000000..6b56aa2e --- /dev/null +++ 
b/include/dwarfs/file_category.h @@ -0,0 +1,72 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include + +namespace dwarfs { + +class file_category { + public: + using value_type = uint32_t; + + static constexpr value_type const uninitialized{ + std::numeric_limits::max()}; + static constexpr value_type const min{0}; + static constexpr value_type const max{std::numeric_limits::max() - + 1}; + + file_category() + : value_{uninitialized} {} + file_category(value_type v) + : value_{v} {} + + file_category(file_category const&) = default; + file_category(file_category&&) = default; + + file_category& operator=(file_category const&) = default; + file_category& operator=(file_category&&) = default; + + file_category& operator=(value_type v) { + value_ = v; + return *this; + } + + value_type value() const { + if (empty()) { + throw std::range_error("file_category is uninitialized"); + } + return value_; + } + + void clear() { value_ = uninitialized; } + + bool empty() const { return value_ == uninitialized; } + + explicit operator bool() const { return !empty(); } + + private: + value_type value_; +}; + +} // namespace dwarfs diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index 
dbca37e5..a8d086a5 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -27,6 +27,7 @@ #include +#include "dwarfs/file_category.h" #include "dwarfs/nilsimsa.h" #include "dwarfs/object.h" @@ -58,6 +59,7 @@ class inode : public object { virtual void add_chunk(size_t block, size_t offset, size_t size) = 0; virtual void append_chunks_to(std::vector& vec) const = 0; + virtual file_category category() const = 0; }; } // namespace dwarfs diff --git a/include/dwarfs/inode_manager.h b/include/dwarfs/inode_manager.h index 386198bc..71f64cc9 100644 --- a/include/dwarfs/inode_manager.h +++ b/include/dwarfs/inode_manager.h @@ -24,6 +24,11 @@ #include #include #include +#include +#include +#include + +#include "dwarfs/file_category.h" namespace dwarfs { @@ -54,6 +59,10 @@ class inode_manager { impl_->for_each_inode_in_order(fn); } + std::vector> category_counts() const { + return impl_->category_counts(); + } + class impl { public: virtual ~impl() = default; @@ -65,6 +74,8 @@ class inode_manager { file_order_options const& file_order, order_cb const& fn) = 0; virtual void for_each_inode_in_order( std::function const&)> const& fn) const = 0; + virtual std::vector> + category_counts() const = 0; }; private: diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index c1edabc1..b4afa689 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "dwarfs/file_stat.h" @@ -32,6 +33,7 @@ namespace dwarfs { +class categorizer_manager; class entry; enum class mlock_mode { NONE, TRY, MUST }; @@ -78,11 +80,12 @@ struct inode_options { bool with_similarity{false}; bool with_nilsimsa{false}; std::optional max_similarity_scan_size; + std::shared_ptr categorizer_mgr; bool needs_scan(size_t size) const { - return (with_similarity || with_nilsimsa) && - (!max_similarity_scan_size || - size <= max_similarity_scan_size.value()); + return categorizer_mgr || ((with_similarity || with_nilsimsa) 
&& + (!max_similarity_scan_size || + size <= max_similarity_scan_size.value())); } }; diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp new file mode 100644 index 00000000..969bbeb0 --- /dev/null +++ b/src/dwarfs/categorizer.cpp @@ -0,0 +1,268 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . 
+ */ + +#include +#include + +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/compiler.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { +constexpr std::string_view const DEFAULT_CATEGORY{""}; +} + +class categorizer_manager_private { + public: + virtual ~categorizer_manager_private() = default; + + virtual std::vector> const& + categorizers() const = 0; + virtual file_category category(std::string_view cat) const = 0; +}; + +template +class categorizer_job_ final : public categorizer_job::impl { + public: + categorizer_job_(logger& lgr, categorizer_manager_private const& mgr, + std::filesystem::path const& path) + : LOG_PROXY_INIT(lgr) + , mgr_{mgr} + , path_{path} {} + + void categorize_random_access(std::span data) override; + void categorize_sequential(std::span data) override; + file_category result() override; + + private: + LOG_PROXY_DECL(LoggerPolicy); + categorizer_manager_private const& mgr_; + + std::string_view best_{DEFAULT_CATEGORY}; + int index_{-1}; + bool is_global_best_{false}; + size_t total_size_hint_{0}; + std::vector>> + seq_jobs_; + std::filesystem::path const path_; +}; + +template +void categorizer_job_::categorize_random_access( + std::span data) { + DWARFS_CHECK(index_ < 0, + "internal error: index already set in categorize_random_access"); + + total_size_hint_ = data.size(); + + bool global_best = true; + + for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { + if (auto p = dynamic_cast(cat.get())) { + if (auto c = p->categorize(path_, data)) { + best_ = *c; + index_ = index; + is_global_best_ = global_best; + break; + } + } else { + global_best = false; + } + } +} + +template +void categorizer_job_::categorize_sequential( + std::span data) { + if (is_global_best_) { + return; + } + + if (seq_jobs_.empty()) [[unlikely]] { + for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { + if 
(index_ >= 0 && static_cast(index) >= index_) { + break; + } + + if (auto p = dynamic_cast(cat.get())) { + if (auto job = p->job(path_, total_size_hint_)) { + seq_jobs_.emplace_back(index, std::move(job)); + } + } + } + } + + for (auto&& [index, job] : seq_jobs_) { + job->add(data); + } +} + +template +file_category categorizer_job_::result() { + if (!seq_jobs_.empty()) { + for (auto&& [index, job] : seq_jobs_) { + if (auto c = job->result()) { + assert(index_ < 0 || index < index_); + best_ = *c; + break; + } + } + + seq_jobs_.clear(); + } + + LOG_TRACE << path_ << " -> " << best_; + + return mgr_.category(best_); +} + +categorizer_job::categorizer_job() = default; + +categorizer_job::categorizer_job(std::unique_ptr impl) + : impl_{std::move(impl)} {} + +template +class categorizer_manager_ final : public categorizer_manager::impl, + public categorizer_manager_private { + public: + categorizer_manager_(logger& lgr) + : lgr_{lgr} + , LOG_PROXY_INIT(lgr) { + add_category(DEFAULT_CATEGORY); + } + + void add(std::shared_ptr c) override; + categorizer_job job(std::filesystem::path const& path) const override; + std::string_view category_name(file_category c) const override; + + std::vector> const& + categorizers() const override { + return categorizers_; + } + + file_category category(std::string_view cat) const override { + auto it = catmap_.find(cat); + DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat)); + return it->second; + } + + private: + void add_category(std::string_view cat) { + if (catmap_.emplace(cat, categories_.size()).second) { + categories_.emplace_back(cat); + } else { + LOG_WARN << "duplicate category: " << cat; + } + } + + logger& lgr_; + LOG_PROXY_DECL(LoggerPolicy); + std::vector> categorizers_; + std::vector categories_; + std::unordered_map catmap_; +}; + +template +void categorizer_manager_::add( + std::shared_ptr c) { + for (auto const& c : c->categories()) { + add_category(c); + } + + 
categorizers_.emplace_back(std::move(c)); +} + +template +categorizer_job categorizer_manager_::job( + std::filesystem::path const& path) const { + return categorizer_job( + make_unique_logging_object(lgr_, *this, path)); +} + +template +std::string_view +categorizer_manager_::category_name(file_category c) const { + return DWARFS_NOTHROW(categories_.at(c.value())); +} + +categorizer_manager::categorizer_manager(logger& lgr) + : impl_(make_unique_logging_object(lgr)) {} + +categorizer_registry& categorizer_registry::instance() { + static categorizer_registry the_instance; + return the_instance; +} + +void categorizer_registry::register_factory( + std::unique_ptr&& factory) { + auto name = factory->name(); /* fetched up front: still needed for the message after the factory is moved */ + + if (!factories_.emplace(name, std::move(factory)).second) { /* a factory with this name is already registered */ + std::cerr << "categorizer factory name conflict (" << name << ")\n"; + ::abort(); + } +} + +std::unique_ptr +categorizer_registry::create(logger& lgr, std::string const& name, + po::variables_map const& vm) const { + auto it = factories_.find(name); + + if (it == factories_.end()) { + DWARFS_THROW(runtime_error, "unknown categorizer: " + name); + } + + return it->second->create(lgr, vm); +} + +void categorizer_registry::add_options(po::options_description& opts) const { + for (auto& f : factories_) { + if (auto f_opts = f.second->options()) { + opts.add(*f_opts); + } + } +} + +std::vector categorizer_registry::categorizer_names() const { + std::vector rv; + for (auto& f : factories_) { + rv.emplace_back(f.first); + } + return rv; +} + +categorizer_registry::categorizer_registry() = default; +categorizer_registry::~categorizer_registry() = default; + +} // namespace dwarfs diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp new file mode 100644 index 00000000..c7c2dc58 --- /dev/null +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -0,0 +1,129 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * 
\copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// TODO: this should obvs. work everywhere +#ifndef _WIN32 +#include +#endif + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { + +constexpr std::string_view const SOME_CATEGORY{"bla"}; + +class binary_categorizer_base : public random_access_categorizer { + public: + std::span categories() const override; +}; + +template +class binary_categorizer_ final : public binary_categorizer_base { + public: + binary_categorizer_(logger& lgr) + : LOG_PROXY_INIT(lgr) {} + + std::optional + categorize(std::filesystem::path const& path, + std::span data) const override; + + private: + LOG_PROXY_DECL(LoggerPolicy); +}; + +std::span binary_categorizer_base::categories() const { + static constexpr std::array const s_categories{ + SOME_CATEGORY, + }; + return s_categories; +} + +template +std::optional +binary_categorizer_::categorize(std::filesystem::path const&, + std::span data + [[maybe_unused]]) const { +#ifndef _WIN32 + auto p = data.data(); + if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) { + switch (p[EI_OSABI]) { + case ELFOSABI_SYSV: // 0 /* UNIX System V ABI */ + case 
ELFOSABI_HPUX: // 1 /* HP-UX */ + case ELFOSABI_NETBSD: // 2 /* NetBSD. */ + case ELFOSABI_GNU: // 3 /* Object uses GNU ELF extensions. */ + case ELFOSABI_SOLARIS: // 6 /* Sun Solaris. */ + case ELFOSABI_AIX: // 7 /* IBM AIX. */ + case ELFOSABI_IRIX: // 8 /* SGI Irix. */ + case ELFOSABI_FREEBSD: // 9 /* FreeBSD. */ + case ELFOSABI_TRU64: // 10 /* Compaq TRU64 UNIX. */ + case ELFOSABI_MODESTO: // 11 /* Novell Modesto. */ + case ELFOSABI_OPENBSD: // 12 /* OpenBSD. */ + case ELFOSABI_ARM_AEABI: // 64 /* ARM EABI */ + case ELFOSABI_ARM: // 97 /* ARM */ + case ELFOSABI_STANDALONE: // 255 /* Standalone (embedded) application */ + break; + } + } +#endif + + return std::nullopt; +} + +class binary_categorizer_factory : public categorizer_factory { + public: + std::string_view name() const override { return "binary"; } + + std::shared_ptr + options() const override { + return nullptr; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return make_unique_logging_object(lgr); + } + + private: +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(binary_categorizer_factory) + +} // namespace dwarfs diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp new file mode 100644 index 00000000..665b4cf6 --- /dev/null +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -0,0 +1,232 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include +#include + +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { + +constexpr std::string_view const INCOMPRESSIBLE_CATEGORY{"incompressible"}; + +struct incompressible_categorizer_config { + size_t min_input_size; + double max_ratio_size; + double max_ratio_blocks; + int lz4_acceleration; +}; + +template +class incompressible_categorizer_job_ : public sequential_categorizer_job { + public: + static constexpr size_t const block_size{1024 * 1024}; + + incompressible_categorizer_job_(logger& lgr, + incompressible_categorizer_config const& cfg, + std::filesystem::path const& path, + size_t total_size) + : LOG_PROXY_INIT(lgr) + , cfg_{cfg} + , path_{path} { + input_.reserve(total_size < block_size ? total_size : block_size); + state_ = ::malloc(LZ4_sizeofState()); + } + + ~incompressible_categorizer_job_() { ::free(state_); } + + void add(std::span data) override { + while (!data.empty()) { + auto part_size = input_.size() + data.size() <= block_size + ? 
data.size() + : block_size - input_.size(); + add_input(data.first(part_size)); + data = data.subspan(part_size); + } + } + + std::optional result() override { + if (!input_.empty()) { + compress(); + } + LOG_TRACE << path_ << " -> blocks: " << incompressible_blocks_ << "/" + << total_blocks_ << ", total compression ratio: " + << fmt::format("{:.2f}%", + 100.0 * total_output_size_ / total_input_size_); + if (total_blocks_ > 0 && + (total_output_size_ >= cfg_.max_ratio_size * total_input_size_ || + incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) { + return INCOMPRESSIBLE_CATEGORY; + } + return std::nullopt; + } + + private: + void add_input(std::span data) { + auto current_size = input_.size(); + assert(current_size + data.size() <= block_size); + input_.resize(current_size + data.size()); + ::memcpy(&input_[current_size], data.data(), data.size()); + if (input_.size() == block_size) { + compress(); + } + } + + void compress() { + total_input_size_ += input_.size(); + + output_.resize(::LZ4_compressBound(input_.size())); + + auto rv = ::LZ4_compress_fast_extState( + state_, reinterpret_cast(input_.data()), + reinterpret_cast(output_.data()), input_.size(), output_.size(), + cfg_.lz4_acceleration); + + if (rv == 0) { + DWARFS_THROW(runtime_error, + "unexpected error in LZ4_compress_fast_extState"); + } + + total_output_size_ += rv; + ++total_blocks_; + + if (rv >= static_cast(cfg_.max_ratio_size * input_.size())) { + ++incompressible_blocks_; + } + + input_.clear(); + } + + LOG_PROXY_DECL(LoggerPolicy); + void* state_; + std::vector input_; + std::vector output_; + size_t total_input_size_{0}; + size_t total_output_size_{0}; + size_t total_blocks_{0}; + size_t incompressible_blocks_{0}; + incompressible_categorizer_config const& cfg_; + std::filesystem::path const& path_; +}; + +class incompressible_categorizer_ final : public sequential_categorizer { + public: + incompressible_categorizer_(logger& lgr, + incompressible_categorizer_config const& 
cfg); + + std::span categories() const override; + std::unique_ptr + job(std::filesystem::path const& path, size_t total_size) const override; + + private: + logger& lgr_; + incompressible_categorizer_config const config_; +}; + +incompressible_categorizer_::incompressible_categorizer_( + logger& lgr, incompressible_categorizer_config const& cfg) + : lgr_{lgr} + , config_{cfg} {} + +std::span +incompressible_categorizer_::categories() const { + static constexpr std::array const s_categories{ + INCOMPRESSIBLE_CATEGORY, + }; + return s_categories; +} + +std::unique_ptr +incompressible_categorizer_::job(std::filesystem::path const& path, + size_t total_size) const { + if (total_size < config_.min_input_size) { + return nullptr; + } + + return make_unique_logging_object(lgr_, config_, path, + total_size); +} + +class incompressible_categorizer_factory : public categorizer_factory { + public: + incompressible_categorizer_factory() + : opts_{std::make_shared( + "Incompressible categorizer options")} { + static constexpr double const default_ratio{0.99}; + auto const default_ratio_str{fmt::format("{:.2f}", default_ratio)}; + // clang-format off + opts_->add_options() + ("incompressible-min-input-size", + po::value(&cfg_.min_input_size)->default_value(256), + "minimum file size in bytes to check for incompressibility") + ("incompressible-max-size-ratio", + po::value(&cfg_.max_ratio_size) + ->default_value(default_ratio, default_ratio_str), + "LZ4 compression ratio above which files are considered incompressible") + ("incompressible-max-blocks-ratio", + po::value(&cfg_.max_ratio_blocks) + ->default_value(default_ratio, default_ratio_str), + "ratio of incompressible LZ4 blocks above which the whole file" + " is considered incompressible") + ("incompressible-lz4-acceleration", + po::value(&cfg_.lz4_acceleration)->default_value(1), + "LZ4 acceleration value (1..65537)") /* the range hint belongs in the help text, not in the option name */ + ; + // clang-format on + } + + std::string_view name() const override { return "incompressible"; } + + 
std::shared_ptr options() const override { + return opts_; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return std::make_unique(lgr, cfg_); + } + + private: + incompressible_categorizer_config cfg_; + std::shared_ptr opts_; +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(incompressible_categorizer_factory) + +} // namespace dwarfs diff --git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp new file mode 100644 index 00000000..c26b961c --- /dev/null +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -0,0 +1,198 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace { + +namespace po = boost::program_options; + +constexpr std::string_view const SOME_CATEGORY{"bla"}; + +std::unordered_set executable_mime_types{ + "application/x-executable", + "application/x-sharedlib", +}; + +class magic_wrapper { + public: + magic_wrapper() = default; + + size_t cookie_count() const { + auto rlock = cookies_.rlock(); + return rlock->size(); + } + + std::string identify(std::span data) const { + std::string rv; + scoped_cookie m(*this); + if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) { + rv.assign(id); + } + if (rv.starts_with("application/")) { + ::magic_setflags(m.get(), MAGIC_NONE); + if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) { + rv += "; " + std::string(id); + } + ::magic_setflags(m.get(), MAGIC_MIME_TYPE); + } + return rv; + } + + private: + using magic_cookie_t = + std::unique_ptr; + + magic_cookie_t new_cookie() const { + magic_cookie_t m(::magic_open(MAGIC_MIME_TYPE), &::magic_close); + if (!m) { + throw std::runtime_error("could not create magic cookie"); + } + if (::magic_load(m.get(), NULL) != 0) { + throw std::runtime_error( + fmt::format("(magic) {}", ::magic_error(m.get()))); + } + return m; + } + + class scoped_cookie { + public: + scoped_cookie(magic_wrapper const& w) + : cookie_{get_scoped_cookie(w)} + , w_{w} {} + + ~scoped_cookie() { + auto wlock = w_.cookies_.wlock(); + wlock->push(std::move(cookie_)); + } + + ::magic_t get() const { return cookie_.get(); } + + private: + static magic_cookie_t get_scoped_cookie(magic_wrapper const& w) { + auto wlock = w.cookies_.wlock(); + if (wlock->empty()) [[unlikely]] { + return w.new_cookie(); + } + auto cookie = std::move(wlock->top()); + wlock->pop(); + return cookie; + } + + magic_cookie_t cookie_; + magic_wrapper 
const& w_; + }; + + mutable folly::Synchronized> cookies_; +}; + +class libmagic_categorizer_base : public random_access_categorizer { + public: + std::span categories() const override; +}; + +template +class libmagic_categorizer_ final : public libmagic_categorizer_base { + public: + explicit libmagic_categorizer_(logger& lgr) + : LOG_PROXY_INIT(lgr) {} + + ~libmagic_categorizer_() { + LOG_INFO << m_.cookie_count() << " magic cookies were used"; + { + auto rlock = mimetypes_.rlock(); + for (auto const& [k, v] : *rlock) { + LOG_INFO << k << " -> " << v; + } + } + } + + std::optional + categorize(std::filesystem::path const& path, + std::span data) const override; + + private: + LOG_PROXY_DECL(LoggerPolicy); + magic_wrapper m_; + mutable folly::Synchronized> mimetypes_; +}; + +std::span +libmagic_categorizer_base::categories() const { + static constexpr std::array const s_categories{ + SOME_CATEGORY, + }; + return s_categories; +} + +template +std::optional libmagic_categorizer_::categorize( + std::filesystem::path const& path, std::span data) const { + auto id = m_.identify(data); + LOG_DEBUG << path << " -> (magic) " << id; + { + auto wlock = mimetypes_.wlock(); + ++(*wlock)[id]; + } + return std::nullopt; +} + +class libmagic_categorizer_factory : public categorizer_factory { + public: + std::string_view name() const override { return "libmagic"; } + + std::shared_ptr + options() const override { + return nullptr; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return make_unique_logging_object(lgr); + } + + private: +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(libmagic_categorizer_factory) + +} // namespace dwarfs diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index cfdd5024..3d05bce0 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -29,10 +29,12 @@ #include #include #include +#include #include #include +#include "dwarfs/categorizer.h" #include 
"dwarfs/compiler.h" #include "dwarfs/entry.h" #include "dwarfs/error.h" @@ -144,8 +146,19 @@ class inode_ : public inode { similarity sc; nilsimsa nc; + categorizer_job catjob; + + if (opts.categorizer_mgr) { + catjob = + opts.categorizer_mgr->job(mm ? mm->path().string() : ""); + } + if (mm) { - auto update_hashes = [&](uint8_t const* data, size_t size) { + if (catjob) { + catjob.categorize_random_access(mm->span()); + } + + auto scan_sequential = [&](uint8_t const* data, size_t size) { if (opts.with_similarity) { sc.update(data, size); } @@ -153,6 +166,10 @@ class inode_ : public inode { if (opts.with_nilsimsa) { nc.update(data, size); } + + if (catjob) { + catjob.categorize_sequential(std::span(data, size)); + } }; constexpr size_t chunk_size = 32 << 20; @@ -160,13 +177,13 @@ class inode_ : public inode { size_t size = mm->size(); while (size >= chunk_size) { - update_hashes(mm->as(offset), chunk_size); + scan_sequential(mm->as(offset), chunk_size); mm->release_until(offset); offset += chunk_size; size -= chunk_size; } - update_hashes(mm->as(offset), size); + scan_sequential(mm->as(offset), size); } if (opts.with_similarity) { @@ -182,6 +199,10 @@ class inode_ : public inode { nilsimsa_valid_ = true; #endif } + + if (catjob) { + category_ = catjob.result(); + } } void add_chunk(size_t block, size_t offset, size_t size) override { @@ -207,9 +228,12 @@ class inode_ : public inode { vec.insert(vec.end(), chunks_.begin(), chunks_.end()); } + file_category category() const override { return category_; } + private: std::optional num_; uint32_t similarity_hash_{0}; + file_category category_; files_vector files_; std::vector chunks_; nilsimsa::hash_type nilsimsa_similarity_hash_; @@ -254,6 +278,27 @@ class inode_manager_ final : public inode_manager::impl { } } + std::vector> + category_counts() const override { + std::unordered_map tmp; + + for (auto const& i : inodes_) { + ++tmp[i->category().value()]; + } + + std::vector> rv; + + for (auto const& [k, v] : tmp) { + 
rv.emplace_back(k, v); + } + + std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) { + return a.first.value() < b.first.value(); + }); + + return rv; + } + private: void order_inodes_by_path() { std::vector paths; diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index c3e2b96d..9c3fcab7 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -37,6 +37,7 @@ #include #include "dwarfs/block_data.h" +#include "dwarfs/categorizer.h" #include "dwarfs/entry.h" #include "dwarfs/error.h" #include "dwarfs/file_scanner.h" @@ -606,6 +607,14 @@ void scanner_::scan( << prog.duplicate_files << "/" << prog.files_found << " duplicate files"; + if (options_.inode.categorizer_mgr) { + for (auto const& cc : im.category_counts()) { + LOG_INFO << cc.second << " " + << options_.inode.categorizer_mgr->category_name(cc.first) + << " files"; + } + } + global_entry_data ge_data(options_); thrift::metadata::metadata mv2; diff --git a/src/mkdwarfs_main.cpp b/src/mkdwarfs_main.cpp index 4b0f8364..e873e73b 100644 --- a/src/mkdwarfs_main.cpp +++ b/src/mkdwarfs_main.cpp @@ -53,6 +53,7 @@ #include "dwarfs/block_compressor.h" #include "dwarfs/block_manager.h" #include "dwarfs/builtin_script.h" +#include "dwarfs/categorizer.h" #include "dwarfs/chmod_transformer.h" #include "dwarfs/console_writer.h" #include "dwarfs/entry.h" @@ -318,7 +319,7 @@ int mkdwarfs_main(int argc, sys_char** argv) { std::vector filter; size_t num_workers, num_scanner_workers; bool no_progress = false, remove_header = false, no_section_index = false, - force_overwrite = false; + force_overwrite = false, enable_categorizer = false; unsigned level; int compress_niceness; uint16_t uid, gid; @@ -391,6 +392,9 @@ int mkdwarfs_main(int argc, sys_char** argv) { ("recompress", po::value(&recompress_opts)->implicit_value("all"), "recompress an existing filesystem (none, block, metadata, all)") + ("categorize", + po::value(&enable_categorizer)->zero_tokens(), + "WIP enable categorizer") ("order", 
po::value(&order), order_desc.c_str()) @@ -510,6 +514,9 @@ int mkdwarfs_main(int argc, sys_char** argv) { .add(filesystem_opts) .add(metadata_opts); + auto& catreg = categorizer_registry::instance(); + catreg.add_options(opts); + po::variables_map vm; auto& sys_err_out = SYS_CERR; @@ -1021,6 +1028,14 @@ int mkdwarfs_main(int argc, sys_char** argv) { options.file_order.mode == file_order_mode::SIMILARITY; options.inode.with_nilsimsa = options.file_order.mode == file_order_mode::NILSIMSA; + if (enable_categorizer) { + options.inode.categorizer_mgr = + std::make_shared(lgr); + // TODO + for (auto const& name : catreg.categorizer_names()) { + options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm)); + } + } scanner s(lgr, wg_scanner, cfg, entry_factory::create(), std::make_shared(), std::move(script), @@ -1031,6 +1046,8 @@ int mkdwarfs_main(int argc, sys_char** argv) { } else { s.scan(fsw, path, prog); } + + options.inode.categorizer_mgr.reset(); } } catch (runtime_error const& e) { LOG_ERROR << e.what(); diff --git a/vcpkg.json b/vcpkg.json index b0270274..fca846b0 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -17,6 +17,7 @@ "glog", "libarchive", "libevent", + "libmagic", "openssl", "pkgconf", "utfcpp",