Initial categorizer implementation

This commit is contained in:
Marcus Holland-Moritz 2023-07-17 11:23:10 +02:00
parent 4e0d2ba25e
commit 1ac36bb6fa
14 changed files with 1219 additions and 8 deletions

View File

@ -212,6 +212,7 @@ if(PKG_CONFIG_FOUND)
pkg_check_modules(LIBBROTLIDEC IMPORTED_TARGET libbrotlidec>=1.0.9)
pkg_check_modules(LIBBROTLIENC IMPORTED_TARGET libbrotlienc>=1.0.9)
pkg_check_modules(LIBARCHIVE IMPORTED_TARGET libarchive>=3.6.0)
pkg_check_modules(LIBMAGIC IMPORTED_TARGET libmagic>=5.38)
pkg_check_modules(ZSTD IMPORTED_TARGET libzstd>=1.5.2)
pkg_check_modules(XXHASH IMPORTED_TARGET libxxhash>=0.8.1)
endif()
@ -356,6 +357,7 @@ list(
src/dwarfs/block_range.cpp
src/dwarfs/builtin_script.cpp
src/dwarfs/cached_block.cpp
src/dwarfs/categorizer.cpp
src/dwarfs/checksum.cpp
src/dwarfs/chmod_transformer.cpp
src/dwarfs/console_writer.cpp
@ -413,8 +415,20 @@ if(LIBBROTLIDEC_FOUND AND LIBBROTLIENC_FOUND)
list(APPEND LIBDWARFS_COMPRESSION_SRC src/dwarfs/compression/brotli.cpp)
endif()
list(
APPEND
LIBDWARFS_CATEGORIZER_SRC
src/dwarfs/categorizer/binary_categorizer.cpp
src/dwarfs/categorizer/incompressible_categorizer.cpp
)
if(LIBMAGIC_FOUND)
list(APPEND LIBDWARFS_CATEGORIZER_SRC src/dwarfs/categorizer/libmagic_categorizer.cpp)
endif()
add_library(dwarfs ${LIBDWARFS_SRC})
add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC})
add_library(dwarfs_categorizer ${LIBDWARFS_CATEGORIZER_SRC})
add_library(dwarfs_tool src/dwarfs/tool.cpp)
if(DWARFS_GIT_BUILD)
@ -425,6 +439,7 @@ target_compile_definitions(
dwarfs_tool PRIVATE PRJ_BUILD_ID="${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_SYSTEM}, ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}"
)
target_link_libraries(dwarfs_categorizer folly)
target_link_libraries(dwarfs_compression folly)
target_link_libraries(dwarfs_tool dwarfs)
@ -719,7 +734,8 @@ target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS})
target_link_libraries(metadata_thrift thrift_light)
foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
foreach(tgt dwarfs dwarfs_compression dwarfs_categorizer
dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
target_include_directories(
${tgt} SYSTEM
PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS}
@ -732,6 +748,7 @@ foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGE
PRIVATE DWARFS_HAVE_LIBZSTD
DWARFS_STATIC_BUILD=${STATIC_BUILD_DO_NOT_USE}
$<$<BOOL:${USE_JEMALLOC}>:DWARFS_USE_JEMALLOC>
$<$<BOOL:${LIBMAGIC_FOUND}>:DWARFS_HAVE_LIBMAGIC>
$<$<BOOL:${LIBLZ4_FOUND}>:DWARFS_HAVE_LIBLZ4>
$<$<BOOL:${LIBLZMA_FOUND}>:DWARFS_HAVE_LIBLZMA>
$<$<AND:$<BOOL:${LIBBROTLIDEC_FOUND}>,$<BOOL:${LIBBROTLIENC_FOUND}>>:DWARFS_HAVE_LIBBROTLI>
@ -809,6 +826,10 @@ target_link_libraries(
fsst
${Boost_LIBRARIES})
if(LIBMAGIC_FOUND)
target_link_libraries(dwarfs PkgConfig::LIBMAGIC)
endif()
if(LIBLZ4_FOUND)
target_link_libraries(dwarfs PkgConfig::LIBLZ4)
endif()
@ -823,6 +844,7 @@ endif()
if(NOT STATIC_BUILD_DO_NOT_USE)
  target_link_libraries(dwarfs PkgConfig::LIBARCHIVE)
  # libmagic is optional: the libmagic categorizer source is only compiled
  # when LIBMAGIC_FOUND is set. The imported target PkgConfig::LIBMAGIC does
  # not exist otherwise, so linking it unconditionally breaks configuration.
  if(LIBMAGIC_FOUND)
    target_link_libraries(dwarfs_categorizer PkgConfig::LIBMAGIC)
  endif()
endif(NOT STATIC_BUILD_DO_NOT_USE)
if(ZSTD_FOUND AND PREFER_SYSTEM_ZSTD)
@ -850,6 +872,7 @@ foreach(tgt ${BINARY_TARGETS} ${MAIN_TARGETS})
endif()
endforeach()
target_link_libraries(mkdwarfs_main "$<LINK_LIBRARY:WHOLE_ARCHIVE,dwarfs_categorizer>")
if(STATIC_BUILD_DO_NOT_USE)
# ...................................................................
@ -883,6 +906,7 @@ if(STATIC_BUILD_DO_NOT_USE)
import_static_lib(static_libssl "libssl.a")
import_static_lib(static_libunwind "libunwind.a")
import_static_lib(static_libarchive "libarchive.a")
import_static_lib(static_libmagic "libmagic.a")
set_target_properties(static_libunwind PROPERTIES INTERFACE_LINK_LIBRARIES
PkgConfig::LIBLZMA)
@ -890,7 +914,10 @@ if(STATIC_BUILD_DO_NOT_USE)
static_libgflags)
set_target_properties(static_librt PROPERTIES INTERFACE_LINK_LIBRARIES
static_libgflags)
set_target_properties(static_libmagic PROPERTIES INTERFACE_LINK_LIBRARIES
static_libz)
target_link_libraries(dwarfs_categorizer static_libmagic)
foreach(tgt ${BINARY_TARGETS})
if(PREFER_SYSTEM_LIBFMT)

View File

@ -0,0 +1,197 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstdint>
#include <filesystem>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <span>
#include <string_view>
#include "dwarfs/file_category.h"
namespace boost::program_options {
class options_description;
class variables_map;
} // namespace boost::program_options
namespace dwarfs {
class logger;
// Common base interface of all categorizers.
class categorizer {
public:
virtual ~categorizer() = default;
// Names of all categories this categorizer can report. The categorizer
// manager registers every name returned here when the categorizer is added.
virtual std::span<std::string_view const> categories() const = 0;
};
// Categorizer that is handed the data to inspect as one contiguous span.
class random_access_categorizer : public categorizer {
public:
// Inspect `data` (belonging to `path`) and return the name of the category
// to assign, or std::nullopt if this categorizer does not assign one.
virtual std::optional<std::string_view>
categorize(std::filesystem::path const& path,
std::span<uint8_t const> data) const = 0;
};
// Stateful job of a sequential categorizer: data is fed in pieces via add()
// and the verdict is queried with result().
class sequential_categorizer_job {
public:
virtual ~sequential_categorizer_job() = default;
// Feed the next piece of data to the job.
virtual void add(std::span<uint8_t const> data) = 0;
// Return the category name determined from the data added so far, or
// std::nullopt if no category is assigned.
virtual std::optional<std::string_view> result() = 0;
};
// Categorizer that processes data incrementally through per-file jobs.
class sequential_categorizer : public categorizer {
public:
// Create a job for `path`; `total_size` is the size hint supplied by the
// caller. May return a null pointer, in which case callers create no job
// for this categorizer.
virtual std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size) const = 0;
};
// Handle for a single categorization job; all work is forwarded to the
// implementation object it owns.
//
// A default-constructed job owns no implementation. Check `operator bool`
// before calling any other member: they dereference the implementation
// pointer unconditionally.
class categorizer_job {
public:
class impl;
// Create an empty job (no implementation attached).
categorizer_job();
// Create a job that takes ownership of `impl`.
categorizer_job(std::unique_ptr<impl> impl);
// Forward `data` to the implementation's random-access categorization.
void categorize_random_access(std::span<uint8_t const> data) {
return impl_->categorize_random_access(data);
}
// Forward `data` to the implementation's sequential categorization.
void categorize_sequential(std::span<uint8_t const> data) {
return impl_->categorize_sequential(data);
}
// Return the category determined by the implementation.
file_category result() { return impl_->result(); }
// True if an implementation is attached.
explicit operator bool() const { return impl_ != nullptr; }
// Interface implemented by the actual job object.
class impl {
public:
virtual ~impl() = default;
virtual void categorize_random_access(std::span<uint8_t const> data) = 0;
virtual void categorize_sequential(std::span<uint8_t const> data) = 0;
virtual file_category result() = 0;
};
private:
std::unique_ptr<impl> impl_;
};
// Owns a list of categorizers and hands out per-file categorization jobs.
// All calls are forwarded to a logger-policy specific implementation.
class categorizer_manager {
public:
categorizer_manager(logger& lgr);
// Add a categorizer; ownership is shared with the caller.
void add(std::shared_ptr<categorizer const> c) { impl_->add(std::move(c)); }
// Create a categorization job for `path`.
categorizer_job job(std::filesystem::path const& path) const {
return impl_->job(path);
}
// Translate a category id back into its name.
std::string_view category_name(file_category c) const {
return impl_->category_name(c);
}
// Interface implemented by the actual manager object.
class impl {
public:
virtual ~impl() = default;
virtual void add(std::shared_ptr<categorizer const> c) = 0;
virtual categorizer_job job(std::filesystem::path const& path) const = 0;
virtual std::string_view category_name(file_category c) const = 0;
};
private:
std::unique_ptr<impl> impl_;
};
// Static description of a categorizer: its name and command line options.
class categorizer_info {
public:
virtual ~categorizer_info() = default;
// Name under which the categorizer is registered and looked up.
virtual std::string_view name() const = 0;
// Options understood by the categorizer; may be a null pointer if it has
// none (callers check before using the result).
virtual std::shared_ptr<boost::program_options::options_description const>
options() const = 0;
};
// Factory interface used by the registry to instantiate a categorizer.
class categorizer_factory : public categorizer_info {
public:
// Create a categorizer that logs to `lgr`; `vm` holds the parsed options.
virtual std::unique_ptr<categorizer>
create(logger& lgr,
boost::program_options::variables_map const& vm) const = 0;
};
namespace detail {
// Helper whose constructor registers a factory of type T with the
// categorizer_registry singleton.
template <typename T>
class categorizer_factory_registrar {
public:
categorizer_factory_registrar();
};
} // namespace detail
// Singleton registry mapping categorizer names to their factories.
class categorizer_registry {
public:
// Registrars are the only code allowed to call register_factory().
template <typename T>
friend class detail::categorizer_factory_registrar;
// Access the single registry instance.
static categorizer_registry& instance();
// Instantiate the categorizer registered as `name`; an unknown name is
// reported as an error.
std::unique_ptr<categorizer>
create(logger& lgr, std::string const& name,
boost::program_options::variables_map const& vm) const;
// Append the options of every registered factory to `opts`.
void add_options(boost::program_options::options_description& opts) const;
// Names of all registered categorizers.
std::vector<std::string> categorizer_names() const;
private:
categorizer_registry();
~categorizer_registry();
void register_factory(std::unique_ptr<categorizer_factory const>&& factory);
std::map<std::string, std::unique_ptr<categorizer_factory const>> factories_;
};
namespace detail {
// Construct a T and hand it to the registry singleton.
template <typename T>
categorizer_factory_registrar<T>::categorizer_factory_registrar() {
::dwarfs::categorizer_registry::instance().register_factory(
std::make_unique<T>());
}
} // namespace detail
// Define a file-local registrar object for `factory`. Its constructor runs
// when the object is initialized and registers the factory with the
// categorizer_registry.
#define REGISTER_CATEGORIZER_FACTORY(factory) \
namespace { \
::dwarfs::detail::categorizer_factory_registrar<factory> \
the_##factory##_registrar; \
}
} // namespace dwarfs

View File

@ -0,0 +1,72 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstdint>
#include <limits>
#include <stdexcept>
namespace dwarfs {
/**
 * Small value type holding a numeric file category id.
 *
 * A default-constructed or cleared object is "empty"; that state is encoded
 * by the reserved value `uninitialized`, so usable ids lie in [min, max].
 */
class file_category {
 public:
  using value_type = uint32_t;

  /// Reserved marker for "no category assigned".
  static constexpr value_type const uninitialized{
      std::numeric_limits<value_type>::max()};
  /// Smallest usable category id.
  static constexpr value_type const min{0};
  /// Largest usable category id (one below the reserved marker).
  static constexpr value_type const max{uninitialized - 1};

  /// Create an empty category.
  file_category() = default;

  /// Create a category holding id `v`.
  file_category(value_type v)
      : value_{v} {}

  file_category(file_category const&) = default;
  file_category(file_category&&) = default;

  file_category& operator=(file_category const&) = default;
  file_category& operator=(file_category&&) = default;

  /// Replace the stored id with `v`.
  file_category& operator=(value_type v) {
    value_ = v;
    return *this;
  }

  /// Return the stored id; throws std::range_error if the object is empty.
  value_type value() const {
    if (!empty()) {
      return value_;
    }
    throw std::range_error("file_category is uninitialized");
  }

  /// Reset to the empty state.
  void clear() { value_ = uninitialized; }

  /// True if no id is stored.
  bool empty() const { return uninitialized == value_; }

  /// True if an id is stored.
  explicit operator bool() const { return value_ != uninitialized; }

 private:
  value_type value_{uninitialized};
};
} // namespace dwarfs

View File

@ -27,6 +27,7 @@
#include <folly/small_vector.h>
#include "dwarfs/file_category.h"
#include "dwarfs/nilsimsa.h"
#include "dwarfs/object.h"
@ -58,6 +59,7 @@ class inode : public object {
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
virtual void
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
virtual file_category category() const = 0;
};
} // namespace dwarfs

View File

@ -24,6 +24,11 @@
#include <cstddef>
#include <functional>
#include <memory>
#include <string_view>
#include <utility>
#include <vector>
#include "dwarfs/file_category.h"
namespace dwarfs {
@ -54,6 +59,10 @@ class inode_manager {
impl_->for_each_inode_in_order(fn);
}
std::vector<std::pair<file_category, size_t>> category_counts() const {
return impl_->category_counts();
}
class impl {
public:
virtual ~impl() = default;
@ -65,6 +74,8 @@ class inode_manager {
file_order_options const& file_order, order_cb const& fn) = 0;
virtual void for_each_inode_in_order(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
virtual std::vector<std::pair<file_category, size_t>>
category_counts() const = 0;
};
private:

View File

@ -25,6 +25,7 @@
#include <cstddef>
#include <functional>
#include <iosfwd>
#include <memory>
#include <optional>
#include "dwarfs/file_stat.h"
@ -32,6 +33,7 @@
namespace dwarfs {
class categorizer_manager;
class entry;
enum class mlock_mode { NONE, TRY, MUST };
@ -78,11 +80,12 @@ struct inode_options {
bool with_similarity{false};
bool with_nilsimsa{false};
std::optional<size_t> max_similarity_scan_size;
std::shared_ptr<categorizer_manager> categorizer_mgr;
bool needs_scan(size_t size) const {
return (with_similarity || with_nilsimsa) &&
(!max_similarity_scan_size ||
size <= max_similarity_scan_size.value());
return categorizer_mgr || ((with_similarity || with_nilsimsa) &&
(!max_similarity_scan_size ||
size <= max_similarity_scan_size.value()));
}
};

268
src/dwarfs/categorizer.cpp Normal file
View File

@ -0,0 +1,268 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <cassert>
#include <unordered_map>
#include <boost/program_options.hpp>
#include <fmt/format.h>
#include <folly/container/Enumerate.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/compiler.h"
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
namespace dwarfs {
namespace po = boost::program_options;
namespace {
// Name of the category a job reports when no categorizer assigns one; the
// manager registers it before any categorizer is added.
constexpr std::string_view const DEFAULT_CATEGORY{"<default>"};
}
// Internal view of the manager that jobs use to reach the categorizer list
// and to translate category names into ids.
class categorizer_manager_private {
public:
virtual ~categorizer_manager_private() = default;
// All categorizers, in the order they were added.
virtual std::vector<std::shared_ptr<categorizer const>> const&
categorizers() const = 0;
// Id of the category named `cat`.
virtual file_category category(std::string_view cat) const = 0;
};
// Job implementation: runs the manager's categorizers for one path and
// resolves the resulting category.
template <typename LoggerPolicy>
class categorizer_job_ final : public categorizer_job::impl {
public:
categorizer_job_(logger& lgr, categorizer_manager_private const& mgr,
std::filesystem::path const& path)
: LOG_PROXY_INIT(lgr)
, mgr_{mgr}
, path_{path} {}
void categorize_random_access(std::span<uint8_t const> data) override;
void categorize_sequential(std::span<uint8_t const> data) override;
file_category result() override;
private:
LOG_PROXY_DECL(LoggerPolicy);
categorizer_manager_private const& mgr_;
// Name of the best category found so far; starts out as the default.
std::string_view best_{DEFAULT_CATEGORY};
// Position in mgr_.categorizers() of the random-access categorizer that
// produced best_, or -1 while none has matched.
int index_{-1};
// Set when best_ came from a random-access categorizer that was not
// preceded by any other kind of categorizer; sequential processing is
// skipped in that case.
bool is_global_best_{false};
// Size of the span passed to categorize_random_access(); handed to
// sequential categorizers as the total size.
size_t total_size_hint_{0};
// Active sequential jobs, each paired with the position of its categorizer.
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
seq_jobs_;
std::filesystem::path const path_;
};
/**
 * Run the random-access categorizers, in manager order, on `data`.
 *
 * The first categorizer that returns a category wins. If every categorizer
 * that precedes the winner was also a random-access one, the result is final
 * and later sequential processing is skipped.
 */
template <typename LoggerPolicy>
void categorizer_job_<LoggerPolicy>::categorize_random_access(
    std::span<uint8_t const> data) {
  DWARFS_CHECK(index_ < 0,
               "internal error: index already set in categorize_random_access");

  total_size_hint_ = data.size();

  // Stays true only while all categorizers seen so far are random-access.
  bool only_random_access_seen = true;

  for (auto&& [pos, candidate] : folly::enumerate(mgr_.categorizers())) {
    auto ra = dynamic_cast<random_access_categorizer const*>(candidate.get());

    if (!ra) {
      only_random_access_seen = false;
      continue;
    }

    auto match = ra->categorize(path_, data);

    if (!match) {
      continue;
    }

    best_ = *match;
    index_ = pos;
    is_global_best_ = only_random_access_seen;
    break;
  }
}
/**
 * Feed `data` to the sequential categorizers.
 *
 * Nothing happens when the random-access pass already produced a final
 * result. While the job list is empty, jobs are created for every sequential
 * categorizer that comes before the random-access match (if there is one);
 * categorizers returning a null job are left out.
 */
template <typename LoggerPolicy>
void categorizer_job_<LoggerPolicy>::categorize_sequential(
    std::span<uint8_t const> data) {
  if (is_global_best_) {
    return;
  }

  if (seq_jobs_.empty()) [[unlikely]] {
    for (auto&& [pos, candidate] : folly::enumerate(mgr_.categorizers())) {
      // Categorizers at or after the random-access match cannot win anymore.
      bool const beyond_match = index_ >= 0 && static_cast<int>(pos) >= index_;

      if (beyond_match) {
        break;
      }

      auto seq = dynamic_cast<sequential_categorizer const*>(candidate.get());

      if (!seq) {
        continue;
      }

      if (auto new_job = seq->job(path_, total_size_hint_)) {
        seq_jobs_.emplace_back(pos, std::move(new_job));
      }
    }
  }

  for (auto& entry : seq_jobs_) {
    entry.second->add(data);
  }
}
/**
 * Resolve the final category for this job.
 *
 * The first sequential job that reports a category overrides the current
 * best; all sequential jobs are discarded afterwards. The winning name is
 * then translated into a category id by the manager.
 */
template <typename LoggerPolicy>
file_category categorizer_job_<LoggerPolicy>::result() {
  for (auto& entry : seq_jobs_) {
    auto verdict = entry.second->result();

    if (!verdict) {
      continue;
    }

    // Jobs are only created for categorizers ahead of the random-access match.
    assert(index_ < 0 || entry.first < index_);
    best_ = *verdict;
    break;
  }

  seq_jobs_.clear();

  LOG_TRACE << path_ << " -> " << best_;

  return mgr_.category(best_);
}
// Empty job: no implementation attached.
categorizer_job::categorizer_job() = default;
// Job that takes ownership of the given implementation.
categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
: impl_{std::move(impl)} {}
/**
 * Manager implementation: keeps the categorizers in insertion order together
 * with a bidirectional mapping between category names and category ids.
 */
template <typename LoggerPolicy>
class categorizer_manager_ final : public categorizer_manager::impl,
                                   public categorizer_manager_private {
 public:
  categorizer_manager_(logger& lgr)
      : lgr_{lgr}
      , LOG_PROXY_INIT(lgr) {
    // The default category is registered before any categorizer is added.
    add_category(DEFAULT_CATEGORY);
  }

  void add(std::shared_ptr<categorizer const> c) override;
  categorizer_job job(std::filesystem::path const& path) const override;
  std::string_view category_name(file_category c) const override;

  /// Categorizers in the order they were added.
  std::vector<std::shared_ptr<categorizer const>> const&
  categorizers() const override {
    return categorizers_;
  }

  /// Id registered for the category `cat`; an unknown name fails the check.
  file_category category(std::string_view cat) const override {
    auto pos = catmap_.find(cat);
    DWARFS_CHECK(pos != catmap_.end(),
                 fmt::format("unknown category: {}", cat));
    return pos->second;
  }

 private:
  /// Register `cat` under the next free id; duplicates only log a warning.
  void add_category(std::string_view cat) {
    bool const is_new = catmap_.emplace(cat, categories_.size()).second;

    if (!is_new) {
      LOG_WARN << "duplicate category: " << cat;
      return;
    }

    categories_.emplace_back(cat);
  }

  logger& lgr_;
  LOG_PROXY_DECL(LoggerPolicy);
  std::vector<std::shared_ptr<categorizer const>> categorizers_;
  // Category names indexed by category id.
  std::vector<std::string_view> categories_;
  // Category name -> category id.
  std::unordered_map<std::string_view, file_category> catmap_;
};
/**
 * Register all category names of `c` and append `c` to the categorizer list.
 */
template <typename LoggerPolicy>
void categorizer_manager_<LoggerPolicy>::add(
    std::shared_ptr<categorizer const> c) {
  auto const names = c->categories();

  for (auto const& name : names) {
    add_category(name);
  }

  categorizers_.emplace_back(std::move(c));
}
// Create a job for `path`. The job implementation is built through the
// logging-object factory with this manager's logger and a reference to this
// manager.
template <typename LoggerPolicy>
categorizer_job categorizer_manager_<LoggerPolicy>::job(
std::filesystem::path const& path) const {
return categorizer_job(
make_unique_logging_object<categorizer_job::impl, categorizer_job_,
logger_policies>(lgr_, *this, path));
}
// Look up the name registered for category id `c`.
// NOTE(review): DWARFS_NOTHROW presumably turns an exception from at() or
// value() into a fatal error — confirm against its definition.
template <typename LoggerPolicy>
std::string_view
categorizer_manager_<LoggerPolicy>::category_name(file_category c) const {
return DWARFS_NOTHROW(categories_.at(c.value()));
}
// Build the manager implementation through the logging-object factory,
// passing it the logger `lgr`.
categorizer_manager::categorizer_manager(logger& lgr)
: impl_(make_unique_logging_object<impl, categorizer_manager_,
logger_policies>(lgr)) {}
// Return the registry singleton, a function-local static that is created on
// first use.
categorizer_registry& categorizer_registry::instance() {
static categorizer_registry the_instance;
return the_instance;
}
/**
 * Register `factory` under the name it reports.
 *
 * A second factory with an already registered name is a programming error:
 * the conflict is written to stderr and the process is aborted.
 */
void categorizer_registry::register_factory(
    std::unique_ptr<categorizer_factory const>&& factory) {
  // Copy the name before emplace(): if the key already exists, the factory
  // may have been moved from and destroyed by the failed insertion, and a
  // view obtained from it is not guaranteed to remain valid afterwards.
  std::string const name{factory->name()};

  if (!factories_.emplace(name, std::move(factory)).second) {
    std::cerr << "categorizer factory name conflict (" << name << ")\n";
    ::abort();
  }
}
/**
 * Instantiate the categorizer registered as `name`.
 *
 * An unknown name raises a runtime_error via DWARFS_THROW; otherwise the
 * matching factory is asked to create the categorizer from `lgr` and `vm`.
 */
std::unique_ptr<categorizer>
categorizer_registry::create(logger& lgr, std::string const& name,
                             po::variables_map const& vm) const {
  auto const found = factories_.find(name);

  if (factories_.end() == found) {
    DWARFS_THROW(runtime_error, "unknown categorizer: " + name);
  }

  auto const& factory = found->second;

  return factory->create(lgr, vm);
}
/**
 * Append the options of every registered factory to `opts`; factories that
 * return no options description are skipped.
 */
void categorizer_registry::add_options(po::options_description& opts) const {
  for (auto& entry : factories_) {
    auto factory_opts = entry.second->options();

    if (!factory_opts) {
      continue;
    }

    opts.add(*factory_opts);
  }
}
/**
 * Return the names of all registered categorizers, in the iteration order of
 * the factory map.
 */
std::vector<std::string> categorizer_registry::categorizer_names() const {
  std::vector<std::string> names;

  for (auto const& entry : factories_) {
    names.push_back(entry.first);
  }

  return names;
}
// Defined out of line because both are private in the class declaration.
categorizer_registry::categorizer_registry() = default;
categorizer_registry::~categorizer_registry() = default;
} // namespace dwarfs

View File

@ -0,0 +1,129 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <array>
#include <cstring>
#include <map>
#include <stack>
#include <unordered_set>
#include <vector>
#include <boost/program_options.hpp>
#include <fmt/format.h>
// TODO: ELF detection should work on every platform, not only where <elf.h>
// is available
#ifndef _WIN32
#include <elf.h>
#endif
#include "dwarfs/categorizer.h"
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
namespace dwarfs {
namespace po = boost::program_options;
namespace {
// The only category name reported by the binary categorizer.
// NOTE(review): "bla" looks like a placeholder name — confirm.
constexpr std::string_view const SOME_CATEGORY{"bla"};
// Logger-independent part of the binary categorizer; provides the category
// list.
class binary_categorizer_base : public random_access_categorizer {
public:
std::span<std::string_view const> categories() const override;
};
// Binary categorizer bound to a logger policy.
template <typename LoggerPolicy>
class binary_categorizer_ final : public binary_categorizer_base {
public:
binary_categorizer_(logger& lgr)
: LOG_PROXY_INIT(lgr) {}
// Inspect the leading bytes of `data`; see the definition for details.
std::optional<std::string_view>
categorize(std::filesystem::path const& path,
std::span<uint8_t const> data) const override;
private:
LOG_PROXY_DECL(LoggerPolicy);
};
// Return the category names of this categorizer. The span refers to a
// function-local static array, so it stays valid after the call returns.
std::span<std::string_view const> binary_categorizer_base::categories() const {
static constexpr std::array const s_categories{
SOME_CATEGORY,
};
return s_categories;
}
// Check whether `data` starts with an ELF header.
//
// On non-Windows builds the ELF magic bytes are compared and the OS ABI byte
// is switched on, but no case does anything yet: the function currently
// always returns std::nullopt. The path argument is unused.
template <typename LoggerPolicy>
std::optional<std::string_view>
binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
std::span<uint8_t const> data
[[maybe_unused]]) const {
#ifndef _WIN32
auto p = data.data();
// Require a complete e_ident array before looking at its bytes.
if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) {
switch (p[EI_OSABI]) {
case ELFOSABI_SYSV: // 0 /* UNIX System V ABI */
case ELFOSABI_HPUX: // 1 /* HP-UX */
case ELFOSABI_NETBSD: // 2 /* NetBSD. */
case ELFOSABI_GNU: // 3 /* Object uses GNU ELF extensions. */
case ELFOSABI_SOLARIS: // 6 /* Sun Solaris. */
case ELFOSABI_AIX: // 7 /* IBM AIX. */
case ELFOSABI_IRIX: // 8 /* SGI Irix. */
case ELFOSABI_FREEBSD: // 9 /* FreeBSD. */
case ELFOSABI_TRU64: // 10 /* Compaq TRU64 UNIX. */
case ELFOSABI_MODESTO: // 11 /* Novell Modesto. */
case ELFOSABI_OPENBSD: // 12 /* OpenBSD. */
case ELFOSABI_ARM_AEABI: // 64 /* ARM EABI */
case ELFOSABI_ARM: // 97 /* ARM */
case ELFOSABI_STANDALONE: // 255 /* Standalone (embedded) application */
break;
}
}
#endif
return std::nullopt;
}
// Factory for the "binary" categorizer; it has no command line options.
class binary_categorizer_factory : public categorizer_factory {
public:
std::string_view name() const override { return "binary"; }
// No options: callers treat the null pointer as "nothing to add".
std::shared_ptr<boost::program_options::options_description const>
options() const override {
return nullptr;
}
// Create the categorizer through the logging-object factory; the parsed
// options are not used.
std::unique_ptr<categorizer>
create(logger& lgr, po::variables_map const& /*vm*/) const override {
return make_unique_logging_object<categorizer, binary_categorizer_,
logger_policies>(lgr);
}
private:
};
} // namespace
REGISTER_CATEGORIZER_FACTORY(binary_categorizer_factory)
} // namespace dwarfs

View File

@ -0,0 +1,232 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <array>
#include <cassert>
#include <cstring>
#include <vector>
#include <boost/program_options.hpp>
#include <fmt/format.h>
#include <lz4.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
namespace dwarfs {
namespace po = boost::program_options;
namespace {
// Category name reported for data that LZ4 cannot compress well enough.
constexpr std::string_view const INCOMPRESSIBLE_CATEGORY{"incompressible"};
/**
 * Tuning parameters of the incompressible categorizer.
 *
 * Every member has a default so that a configuration which was never filled
 * in from parsed command line options still holds well-defined values. The
 * defaults mirror the defaults of the corresponding command line options.
 */
struct incompressible_categorizer_config {
  /// Inputs smaller than this many bytes are not checked at all.
  size_t min_input_size{256};
  /// Compressed/uncompressed size ratio at or above which data counts as
  /// incompressible.
  double max_ratio_size{0.99};
  /// Fraction of incompressible blocks at or above which the whole input
  /// counts as incompressible.
  double max_ratio_blocks{0.99};
  /// Acceleration value passed to LZ4.
  int lz4_acceleration{1};
};
/**
 * Sequential job that decides whether a stream of data is incompressible.
 *
 * Incoming data is collected into blocks of at most `block_size` bytes. Each
 * block is compressed with LZ4 and the job keeps track of the accumulated
 * input/output sizes and of the number of blocks that did not compress well
 * enough. result() compares those figures against the configured ratios.
 */
template <typename LoggerPolicy>
class incompressible_categorizer_job_ : public sequential_categorizer_job {
 public:
  static constexpr size_t const block_size{1024 * 1024};

  /**
   * \param lgr         logger used for trace output
   * \param cfg         thresholds and LZ4 settings (copied)
   * \param path        path the data belongs to, used for logging (copied)
   * \param total_size  size hint used to pre-size the input buffer
   *
   * Raises a runtime_error if the LZ4 state cannot be allocated.
   */
  incompressible_categorizer_job_(logger& lgr,
                                  incompressible_categorizer_config const& cfg,
                                  std::filesystem::path const& path,
                                  size_t total_size)
      : LOG_PROXY_INIT(lgr)
      , state_{::malloc(LZ4_sizeofState())}
      , cfg_{cfg}
      , path_{path} {
    if (!state_) {
      DWARFS_THROW(runtime_error, "failed to allocate LZ4 state");
    }
    input_.reserve(total_size < block_size ? total_size : block_size);
  }

  /// Append `data`, compressing every block as soon as it is full.
  void add(std::span<uint8_t const> data) override {
    while (!data.empty()) {
      // Take as much as still fits into the current block.
      auto const part_size = input_.size() + data.size() <= block_size
                                 ? data.size()
                                 : block_size - input_.size();
      add_input(data.first(part_size));
      data = data.subspan(part_size);
    }
  }

  /**
   * Compress any pending partial block and return the incompressible
   * category if either threshold is reached, std::nullopt otherwise.
   */
  std::optional<std::string_view> result() override {
    if (!input_.empty()) {
      compress();
    }

    // Avoid a division by zero in the trace output when no data was added.
    double const ratio_percent =
        total_input_size_ > 0 ? 100.0 * total_output_size_ / total_input_size_
                              : 0.0;

    LOG_TRACE << path_ << " -> blocks: " << incompressible_blocks_ << "/"
              << total_blocks_ << ", total compression ratio: "
              << fmt::format("{:.2f}%", ratio_percent);

    if (total_blocks_ > 0 &&
        (total_output_size_ >= cfg_.max_ratio_size * total_input_size_ ||
         incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) {
      return INCOMPRESSIBLE_CATEGORY;
    }

    return std::nullopt;
  }

 private:
  /// Releases memory obtained from malloc().
  struct free_deleter {
    void operator()(void* p) const noexcept { ::free(p); }
  };

  /// Append `data` (which must fit) to the current block.
  void add_input(std::span<uint8_t const> data) {
    auto const current_size = input_.size();
    assert(current_size + data.size() <= block_size);
    input_.resize(current_size + data.size());
    ::memcpy(&input_[current_size], data.data(), data.size());
    if (input_.size() == block_size) {
      compress();
    }
  }

  /// Compress the current block, update the statistics and reset the block.
  void compress() {
    total_input_size_ += input_.size();
    output_.resize(::LZ4_compressBound(input_.size()));

    auto const rv = ::LZ4_compress_fast_extState(
        state_.get(), reinterpret_cast<char const*>(input_.data()),
        reinterpret_cast<char*>(output_.data()), input_.size(), output_.size(),
        cfg_.lz4_acceleration);

    // Anything that is not a positive compressed size is a failure.
    if (rv <= 0) {
      DWARFS_THROW(runtime_error,
                   "unexpected error in LZ4_compress_fast_extState");
    }

    total_output_size_ += rv;
    ++total_blocks_;

    if (rv >= static_cast<int>(cfg_.max_ratio_size * input_.size())) {
      ++incompressible_blocks_;
    }

    input_.clear();
  }

  LOG_PROXY_DECL(LoggerPolicy);
  // LZ4 state buffer; owning it through unique_ptr makes the job non-copyable
  // and rules out a double free.
  std::unique_ptr<void, free_deleter> state_;
  std::vector<uint8_t> input_;
  std::vector<uint8_t> output_;
  size_t total_input_size_{0};
  size_t total_output_size_{0};
  size_t total_blocks_{0};
  size_t incompressible_blocks_{0};
  // Stored by value so the job does not depend on the lifetime of the
  // arguments passed to the constructor.
  incompressible_categorizer_config const cfg_;
  std::filesystem::path const path_;
};
// Sequential categorizer that creates LZ4-based incompressibility jobs.
class incompressible_categorizer_ final : public sequential_categorizer {
public:
incompressible_categorizer_(logger& lgr,
incompressible_categorizer_config const& cfg);
std::span<std::string_view const> categories() const override;
std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size) const override;
private:
logger& lgr_;
// Private copy of the configuration passed to the constructor.
incompressible_categorizer_config const config_;
};
// Keep a reference to the logger and a copy of the configuration.
incompressible_categorizer_::incompressible_categorizer_(
logger& lgr, incompressible_categorizer_config const& cfg)
: lgr_{lgr}
, config_{cfg} {}
// Return the category names of this categorizer. The span refers to a
// function-local static array, so it stays valid after the call returns.
std::span<std::string_view const>
incompressible_categorizer_::categories() const {
static constexpr std::array const s_categories{
INCOMPRESSIBLE_CATEGORY,
};
return s_categories;
}
/**
 * Create a job for `path`, or return a null pointer when `total_size` is
 * below the configured minimum input size.
 */
std::unique_ptr<sequential_categorizer_job>
incompressible_categorizer_::job(std::filesystem::path const& path,
                                 size_t total_size) const {
  bool const large_enough = total_size >= config_.min_input_size;

  if (large_enough) {
    return make_unique_logging_object<sequential_categorizer_job,
                                      incompressible_categorizer_job_,
                                      logger_policies>(lgr_, config_, path,
                                                       total_size);
  }

  return nullptr;
}
/**
 * Factory for the "incompressible" categorizer.
 *
 * The command line options write straight into `cfg_`; create() hands a copy
 * of that configuration to each categorizer it builds.
 */
class incompressible_categorizer_factory : public categorizer_factory {
 public:
  incompressible_categorizer_factory()
      : opts_{std::make_shared<po::options_description>(
            "Incompressible categorizer options")} {
    static constexpr double const default_ratio{0.99};
    auto const default_ratio_str{fmt::format("{:.2f}", default_ratio)};
    // The valid range of the acceleration value belongs in the description:
    // as part of the option name it would have to be typed on the command
    // line verbatim.
    // clang-format off
    opts_->add_options()
      ("incompressible-min-input-size",
          po::value<size_t>(&cfg_.min_input_size)->default_value(256),
          "minimum file size in bytes to check for incompressibility")
      ("incompressible-max-size-ratio",
          po::value<double>(&cfg_.max_ratio_size)
              ->default_value(default_ratio, default_ratio_str),
          "LZ4 compression ratio above which files are considered"
          " incompressible")
      ("incompressible-max-blocks-ratio",
          po::value<double>(&cfg_.max_ratio_blocks)
              ->default_value(default_ratio, default_ratio_str),
          "ratio of incompressible LZ4 blocks above which the whole file"
          " is considered incompressible")
      ("incompressible-lz4-acceleration",
          po::value<int>(&cfg_.lz4_acceleration)->default_value(1),
          "LZ4 acceleration value (1..65537)")
      ;
    // clang-format on
  }

  std::string_view name() const override { return "incompressible"; }

  std::shared_ptr<po::options_description const> options() const override {
    return opts_;
  }

  /// Create a categorizer using the current contents of `cfg_`.
  std::unique_ptr<categorizer>
  create(logger& lgr, po::variables_map const& /*vm*/) const override {
    return std::make_unique<incompressible_categorizer_>(lgr, cfg_);
  }

 private:
  // Value-initialized so the members are never indeterminate, even if
  // create() runs before any option value has been stored.
  incompressible_categorizer_config cfg_{};
  std::shared_ptr<po::options_description> opts_;
};
} // namespace
REGISTER_CATEGORIZER_FACTORY(incompressible_categorizer_factory)
} // namespace dwarfs

View File

@ -0,0 +1,198 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <array>
#include <map>
#include <stack>
#include <unordered_set>
#include <vector>
#include <boost/program_options.hpp>
#include <fmt/format.h>
#include <folly/Synchronized.h>
#include <magic.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
namespace dwarfs {
namespace {
namespace po = boost::program_options;
// The only category name reported by the libmagic categorizer.
// NOTE(review): "bla" looks like a placeholder name — confirm.
constexpr std::string_view const SOME_CATEGORY{"bla"};
// MIME types that identify executables and shared libraries.
// NOTE(review): not referenced in the visible part of this file — confirm
// whether it is used further down.
std::unordered_set<std::string_view> executable_mime_types{
"application/x-executable",
"application/x-sharedlib",
};
/**
 * Wrapper around libmagic that keeps a pool of magic cookies.
 *
 * A cookie is taken out of the pool for the duration of each identify() call
 * and put back afterwards, so no cookie is handed to two calls at once. New
 * cookies are created on demand when the pool is empty.
 */
class magic_wrapper {
 public:
  magic_wrapper() = default;

  /// Number of cookies currently sitting in the pool.
  size_t cookie_count() const {
    auto rlock = cookies_.rlock();
    return rlock->size();
  }

  /**
   * Identify `data` and return its MIME type. For "application/..." types
   * the textual libmagic description is appended after "; ". The string is
   * empty if libmagic returns no result.
   */
  std::string identify(std::span<uint8_t const> data) const {
    std::string rv;
    scoped_cookie m(*this);
    if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) {
      rv.assign(id);
    }
    if (rv.starts_with("application/")) {
      // Temporarily switch the cookie to textual output, then switch back so
      // it returns to the pool in MIME mode.
      ::magic_setflags(m.get(), MAGIC_NONE);
      if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) {
        rv += "; " + std::string(id);
      }
      ::magic_setflags(m.get(), MAGIC_MIME_TYPE);
    }
    return rv;
  }

 private:
  using magic_cookie_t =
      std::unique_ptr<struct ::magic_set, decltype(&::magic_close)>;

  /// Open a cookie in MIME mode and load the default magic database.
  /// Throws std::runtime_error on failure.
  magic_cookie_t new_cookie() const {
    magic_cookie_t m(::magic_open(MAGIC_MIME_TYPE), &::magic_close);
    if (!m) {
      throw std::runtime_error("could not create magic cookie");
    }
    if (::magic_load(m.get(), nullptr) != 0) {
      // magic_error() may return a null pointer; never format that directly.
      auto const* msg = ::magic_error(m.get());
      throw std::runtime_error(
          fmt::format("(magic) {}", msg ? msg : "unknown error"));
    }
    return m;
  }

  /// Borrows a cookie from the pool for the lifetime of the object.
  class scoped_cookie {
   public:
    scoped_cookie(magic_wrapper const& w)
        : cookie_{get_scoped_cookie(w)}
        , w_{w} {}

    ~scoped_cookie() {
      auto wlock = w_.cookies_.wlock();
      wlock->push(std::move(cookie_));
    }

    ::magic_t get() const { return cookie_.get(); }

   private:
    static magic_cookie_t get_scoped_cookie(magic_wrapper const& w) {
      {
        auto wlock = w.cookies_.wlock();
        if (!wlock->empty()) {
          auto cookie = std::move(wlock->top());
          wlock->pop();
          return cookie;
        }
      }
      // Pool is empty: create the cookie only after the lock was released,
      // so other threads are not blocked while libmagic opens and loads its
      // database.
      return w.new_cookie();
    }

    magic_cookie_t cookie_;
    magic_wrapper const& w_;
  };

  mutable folly::Synchronized<std::stack<magic_cookie_t>> cookies_;
};
// Non-template base of the libmagic categorizer. It declares categories()
// here so that its out-of-line definition does not depend on the logger
// policy of the class template derived from it.
class libmagic_categorizer_base : public random_access_categorizer {
public:
// Returns the list of category names this categorizer can report.
std::span<std::string_view const> categories() const override;
};
/**
 * libmagic-backed categorizer, parameterized on the logger policy.
 *
 * Keeps a per-identification-string counter of everything categorize() has
 * seen; the counters and the number of pooled cookies are logged at INFO
 * level when the object is destroyed.
 */
template <typename LoggerPolicy>
class libmagic_categorizer_ final : public libmagic_categorizer_base {
 public:
  explicit libmagic_categorizer_(logger& lgr)
      : LOG_PROXY_INIT(lgr) {}

  ~libmagic_categorizer_() {
    LOG_INFO << m_.cookie_count() << " magic cookies were used";

    // Dump the collected statistics, one line per identification string.
    auto stats = mimetypes_.rlock();
    for (auto const& entry : *stats) {
      LOG_INFO << entry.first << " -> " << entry.second;
    }
  }

  std::optional<std::string_view>
  categorize(std::filesystem::path const& path,
             std::span<uint8_t const> data) const override;

 private:
  LOG_PROXY_DECL(LoggerPolicy);
  magic_wrapper m_;
  // identification string -> number of times it was seen by categorize()
  mutable folly::Synchronized<std::map<std::string, size_t>> mimetypes_;
};
/// Category names this categorizer can report (currently just the
/// placeholder SOME_CATEGORY).
std::span<std::string_view const>
libmagic_categorizer_base::categories() const {
  // Static storage keeps the returned span valid after the call returns.
  static constexpr std::array<std::string_view, 1> names{SOME_CATEGORY};
  return std::span<std::string_view const>{names};
}
/**
 * Identify `data` via libmagic, log the result at DEBUG level and bump the
 * counter for the resulting identification string.
 *
 * \returns always std::nullopt — no category is assigned yet.
 */
template <typename LoggerPolicy>
std::optional<std::string_view> libmagic_categorizer_<LoggerPolicy>::categorize(
    std::filesystem::path const& path, std::span<uint8_t const> data) const {
  auto const mime = m_.identify(data);

  LOG_DEBUG << path << " -> (magic) " << mime;

  // Update the statistics under the write lock; the lock is released as
  // soon as the callback returns.
  mimetypes_.withWLock([&mime](auto& counts) { ++counts[mime]; });

  return std::nullopt;
}
class libmagic_categorizer_factory : public categorizer_factory {
public:
std::string_view name() const override { return "libmagic"; }
std::shared_ptr<boost::program_options::options_description const>
options() const override {
return nullptr;
}
std::unique_ptr<categorizer>
create(logger& lgr, po::variables_map const& /*vm*/) const override {
return make_unique_logging_object<categorizer, libmagic_categorizer_,
logger_policies>(lgr);
}
private:
};
} // namespace
// Hand the factory to the project's REGISTER_CATEGORIZER_FACTORY macro.
// NOTE(review): the macro is defined outside this file; presumably it makes
// the "libmagic" categorizer available through categorizer_registry — confirm.
REGISTER_CATEGORIZER_FACTORY(libmagic_categorizer_factory)
} // namespace dwarfs

View File

@ -29,10 +29,12 @@
#include <limits>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>
#include <fmt/format.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/compiler.h"
#include "dwarfs/entry.h"
#include "dwarfs/error.h"
@ -144,8 +146,19 @@ class inode_ : public inode {
similarity sc;
nilsimsa nc;
categorizer_job catjob;
if (opts.categorizer_mgr) {
catjob =
opts.categorizer_mgr->job(mm ? mm->path().string() : "<no-file>");
}
if (mm) {
auto update_hashes = [&](uint8_t const* data, size_t size) {
if (catjob) {
catjob.categorize_random_access(mm->span());
}
auto scan_sequential = [&](uint8_t const* data, size_t size) {
if (opts.with_similarity) {
sc.update(data, size);
}
@ -153,6 +166,10 @@ class inode_ : public inode {
if (opts.with_nilsimsa) {
nc.update(data, size);
}
if (catjob) {
catjob.categorize_sequential(std::span(data, size));
}
};
constexpr size_t chunk_size = 32 << 20;
@ -160,13 +177,13 @@ class inode_ : public inode {
size_t size = mm->size();
while (size >= chunk_size) {
update_hashes(mm->as<uint8_t>(offset), chunk_size);
scan_sequential(mm->as<uint8_t>(offset), chunk_size);
mm->release_until(offset);
offset += chunk_size;
size -= chunk_size;
}
update_hashes(mm->as<uint8_t>(offset), size);
scan_sequential(mm->as<uint8_t>(offset), size);
}
if (opts.with_similarity) {
@ -182,6 +199,10 @@ class inode_ : public inode {
nilsimsa_valid_ = true;
#endif
}
if (catjob) {
category_ = catjob.result();
}
}
void add_chunk(size_t block, size_t offset, size_t size) override {
@ -207,9 +228,12 @@ class inode_ : public inode {
vec.insert(vec.end(), chunks_.begin(), chunks_.end());
}
file_category category() const override { return category_; }
private:
std::optional<uint32_t> num_;
uint32_t similarity_hash_{0};
file_category category_;
files_vector files_;
std::vector<chunk_type> chunks_;
nilsimsa::hash_type nilsimsa_similarity_hash_;
@ -254,6 +278,27 @@ class inode_manager_ final : public inode_manager::impl {
}
}
std::vector<std::pair<file_category, size_t>>
category_counts() const override {
std::unordered_map<file_category::value_type, size_t> tmp;
for (auto const& i : inodes_) {
++tmp[i->category().value()];
}
std::vector<std::pair<file_category, size_t>> rv;
for (auto const& [k, v] : tmp) {
rv.emplace_back(k, v);
}
std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) {
return a.first.value() < b.first.value();
});
return rv;
}
private:
void order_inodes_by_path() {
std::vector<std::string> paths;

View File

@ -37,6 +37,7 @@
#include <fmt/format.h>
#include "dwarfs/block_data.h"
#include "dwarfs/categorizer.h"
#include "dwarfs/entry.h"
#include "dwarfs/error.h"
#include "dwarfs/file_scanner.h"
@ -606,6 +607,14 @@ void scanner_<LoggerPolicy>::scan(
<< prog.duplicate_files << "/" << prog.files_found
<< " duplicate files";
if (options_.inode.categorizer_mgr) {
for (auto const& cc : im.category_counts()) {
LOG_INFO << cc.second << " "
<< options_.inode.categorizer_mgr->category_name(cc.first)
<< " files";
}
}
global_entry_data ge_data(options_);
thrift::metadata::metadata mv2;

View File

@ -53,6 +53,7 @@
#include "dwarfs/block_compressor.h"
#include "dwarfs/block_manager.h"
#include "dwarfs/builtin_script.h"
#include "dwarfs/categorizer.h"
#include "dwarfs/chmod_transformer.h"
#include "dwarfs/console_writer.h"
#include "dwarfs/entry.h"
@ -318,7 +319,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
std::vector<sys_string> filter;
size_t num_workers, num_scanner_workers;
bool no_progress = false, remove_header = false, no_section_index = false,
force_overwrite = false;
force_overwrite = false, enable_categorizer = false;
unsigned level;
int compress_niceness;
uint16_t uid, gid;
@ -391,6 +392,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)")
("categorize",
po::value<bool>(&enable_categorizer)->zero_tokens(),
"WIP enable categorizer")
("order",
po::value<std::string>(&order),
order_desc.c_str())
@ -510,6 +514,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
.add(filesystem_opts)
.add(metadata_opts);
auto& catreg = categorizer_registry::instance();
catreg.add_options(opts);
po::variables_map vm;
auto& sys_err_out = SYS_CERR;
@ -1021,6 +1028,14 @@ int mkdwarfs_main(int argc, sys_char** argv) {
options.file_order.mode == file_order_mode::SIMILARITY;
options.inode.with_nilsimsa =
options.file_order.mode == file_order_mode::NILSIMSA;
if (enable_categorizer) {
options.inode.categorizer_mgr =
std::make_shared<categorizer_manager>(lgr);
// TODO
for (auto const& name : catreg.categorizer_names()) {
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
}
}
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
std::make_shared<os_access_generic>(), std::move(script),
@ -1031,6 +1046,8 @@ int mkdwarfs_main(int argc, sys_char** argv) {
} else {
s.scan(fsw, path, prog);
}
options.inode.categorizer_mgr.reset();
}
} catch (runtime_error const& e) {
LOG_ERROR << e.what();

View File

@ -17,6 +17,7 @@
"glog",
"libarchive",
"libevent",
"libmagic",
"openssl",
"pkgconf",
"utfcpp",