mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
Initial categorizer implementation
This commit is contained in:
parent
4e0d2ba25e
commit
1ac36bb6fa
@ -212,6 +212,7 @@ if(PKG_CONFIG_FOUND)
|
||||
pkg_check_modules(LIBBROTLIDEC IMPORTED_TARGET libbrotlidec>=1.0.9)
|
||||
pkg_check_modules(LIBBROTLIENC IMPORTED_TARGET libbrotlienc>=1.0.9)
|
||||
pkg_check_modules(LIBARCHIVE IMPORTED_TARGET libarchive>=3.6.0)
|
||||
pkg_check_modules(LIBMAGIC IMPORTED_TARGET libmagic>=5.38)
|
||||
pkg_check_modules(ZSTD IMPORTED_TARGET libzstd>=1.5.2)
|
||||
pkg_check_modules(XXHASH IMPORTED_TARGET libxxhash>=0.8.1)
|
||||
endif()
|
||||
@ -356,6 +357,7 @@ list(
|
||||
src/dwarfs/block_range.cpp
|
||||
src/dwarfs/builtin_script.cpp
|
||||
src/dwarfs/cached_block.cpp
|
||||
src/dwarfs/categorizer.cpp
|
||||
src/dwarfs/checksum.cpp
|
||||
src/dwarfs/chmod_transformer.cpp
|
||||
src/dwarfs/console_writer.cpp
|
||||
@ -413,8 +415,20 @@ if(LIBBROTLIDEC_FOUND AND LIBBROTLIENC_FOUND)
|
||||
list(APPEND LIBDWARFS_COMPRESSION_SRC src/dwarfs/compression/brotli.cpp)
|
||||
endif()
|
||||
|
||||
list(
|
||||
APPEND
|
||||
LIBDWARFS_CATEGORIZER_SRC
|
||||
src/dwarfs/categorizer/binary_categorizer.cpp
|
||||
src/dwarfs/categorizer/incompressible_categorizer.cpp
|
||||
)
|
||||
|
||||
if(LIBMAGIC_FOUND)
|
||||
list(APPEND LIBDWARFS_CATEGORIZER_SRC src/dwarfs/categorizer/libmagic_categorizer.cpp)
|
||||
endif()
|
||||
|
||||
add_library(dwarfs ${LIBDWARFS_SRC})
|
||||
add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC})
|
||||
add_library(dwarfs_categorizer ${LIBDWARFS_CATEGORIZER_SRC})
|
||||
add_library(dwarfs_tool src/dwarfs/tool.cpp)
|
||||
|
||||
if(DWARFS_GIT_BUILD)
|
||||
@ -425,6 +439,7 @@ target_compile_definitions(
|
||||
dwarfs_tool PRIVATE PRJ_BUILD_ID="${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_SYSTEM}, ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}"
|
||||
)
|
||||
|
||||
target_link_libraries(dwarfs_categorizer folly)
|
||||
target_link_libraries(dwarfs_compression folly)
|
||||
target_link_libraries(dwarfs_tool dwarfs)
|
||||
|
||||
@ -719,7 +734,8 @@ target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS})
|
||||
|
||||
target_link_libraries(metadata_thrift thrift_light)
|
||||
|
||||
foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
|
||||
foreach(tgt dwarfs dwarfs_compression dwarfs_categorizer
|
||||
dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
|
||||
target_include_directories(
|
||||
${tgt} SYSTEM
|
||||
PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS}
|
||||
@ -732,6 +748,7 @@ foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGE
|
||||
PRIVATE DWARFS_HAVE_LIBZSTD
|
||||
DWARFS_STATIC_BUILD=${STATIC_BUILD_DO_NOT_USE}
|
||||
$<$<BOOL:${USE_JEMALLOC}>:DWARFS_USE_JEMALLOC>
|
||||
$<$<BOOL:${LIBMAGIC_FOUND}>:DWARFS_HAVE_LIBMAGIC>
|
||||
$<$<BOOL:${LIBLZ4_FOUND}>:DWARFS_HAVE_LIBLZ4>
|
||||
$<$<BOOL:${LIBLZMA_FOUND}>:DWARFS_HAVE_LIBLZMA>
|
||||
$<$<AND:$<BOOL:${LIBBROTLIDEC_FOUND}>,$<BOOL:${LIBBROTLIENC_FOUND}>>:DWARFS_HAVE_LIBBROTLI>
|
||||
@ -809,6 +826,10 @@ target_link_libraries(
|
||||
fsst
|
||||
${Boost_LIBRARIES})
|
||||
|
||||
if(LIBMAGIC_FOUND)
|
||||
target_link_libraries(dwarfs PkgConfig::LIBMAGIC)
|
||||
endif()
|
||||
|
||||
if(LIBLZ4_FOUND)
|
||||
target_link_libraries(dwarfs PkgConfig::LIBLZ4)
|
||||
endif()
|
||||
@ -823,6 +844,7 @@ endif()
|
||||
|
||||
# Dynamic (non-static) builds link against the pkg-config imported targets.
if(NOT STATIC_BUILD_DO_NOT_USE)
  target_link_libraries(dwarfs PkgConfig::LIBARCHIVE)
  # PkgConfig::LIBMAGIC is only defined when pkg_check_modules() found
  # libmagic. The libmagic categorizer source is already optional, so the
  # link dependency has to be optional as well; otherwise configuring fails
  # on systems without libmagic.
  if(LIBMAGIC_FOUND)
    target_link_libraries(dwarfs_categorizer PkgConfig::LIBMAGIC)
  endif()
endif(NOT STATIC_BUILD_DO_NOT_USE)
|
||||
|
||||
if(ZSTD_FOUND AND PREFER_SYSTEM_ZSTD)
|
||||
@ -850,6 +872,7 @@ foreach(tgt ${BINARY_TARGETS} ${MAIN_TARGETS})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
target_link_libraries(mkdwarfs_main "$<LINK_LIBRARY:WHOLE_ARCHIVE,dwarfs_categorizer>")
|
||||
|
||||
if(STATIC_BUILD_DO_NOT_USE)
|
||||
# ...................................................................
|
||||
@ -883,6 +906,7 @@ if(STATIC_BUILD_DO_NOT_USE)
|
||||
import_static_lib(static_libssl "libssl.a")
|
||||
import_static_lib(static_libunwind "libunwind.a")
|
||||
import_static_lib(static_libarchive "libarchive.a")
|
||||
import_static_lib(static_libmagic "libmagic.a")
|
||||
|
||||
set_target_properties(static_libunwind PROPERTIES INTERFACE_LINK_LIBRARIES
|
||||
PkgConfig::LIBLZMA)
|
||||
@ -890,7 +914,10 @@ if(STATIC_BUILD_DO_NOT_USE)
|
||||
static_libgflags)
|
||||
set_target_properties(static_librt PROPERTIES INTERFACE_LINK_LIBRARIES
|
||||
static_libgflags)
|
||||
set_target_properties(static_libmagic PROPERTIES INTERFACE_LINK_LIBRARIES
|
||||
static_libz)
|
||||
|
||||
target_link_libraries(dwarfs_categorizer static_libmagic)
|
||||
|
||||
foreach(tgt ${BINARY_TARGETS})
|
||||
if(PREFER_SYSTEM_LIBFMT)
|
||||
|
197
include/dwarfs/categorizer.h
Normal file
197
include/dwarfs/categorizer.h
Normal file
@ -0,0 +1,197 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <span>
|
||||
#include <string_view>
|
||||
|
||||
#include "dwarfs/file_category.h"
|
||||
|
||||
namespace boost::program_options {
|
||||
class options_description;
|
||||
class variables_map;
|
||||
} // namespace boost::program_options
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class logger;
|
||||
|
||||
class categorizer {
|
||||
public:
|
||||
virtual ~categorizer() = default;
|
||||
|
||||
virtual std::span<std::string_view const> categories() const = 0;
|
||||
};
|
||||
|
||||
class random_access_categorizer : public categorizer {
|
||||
public:
|
||||
virtual std::optional<std::string_view>
|
||||
categorize(std::filesystem::path const& path,
|
||||
std::span<uint8_t const> data) const = 0;
|
||||
};
|
||||
|
||||
class sequential_categorizer_job {
|
||||
public:
|
||||
virtual ~sequential_categorizer_job() = default;
|
||||
|
||||
virtual void add(std::span<uint8_t const> data) = 0;
|
||||
virtual std::optional<std::string_view> result() = 0;
|
||||
};
|
||||
|
||||
class sequential_categorizer : public categorizer {
|
||||
public:
|
||||
virtual std::unique_ptr<sequential_categorizer_job>
|
||||
job(std::filesystem::path const& path, size_t total_size) const = 0;
|
||||
};
|
||||
|
||||
class categorizer_job {
|
||||
public:
|
||||
class impl;
|
||||
|
||||
categorizer_job();
|
||||
categorizer_job(std::unique_ptr<impl> impl);
|
||||
|
||||
void categorize_random_access(std::span<uint8_t const> data) {
|
||||
return impl_->categorize_random_access(data);
|
||||
}
|
||||
|
||||
void categorize_sequential(std::span<uint8_t const> data) {
|
||||
return impl_->categorize_sequential(data);
|
||||
}
|
||||
|
||||
file_category result() { return impl_->result(); }
|
||||
|
||||
explicit operator bool() const { return impl_ != nullptr; }
|
||||
|
||||
class impl {
|
||||
public:
|
||||
virtual ~impl() = default;
|
||||
|
||||
virtual void categorize_random_access(std::span<uint8_t const> data) = 0;
|
||||
virtual void categorize_sequential(std::span<uint8_t const> data) = 0;
|
||||
virtual file_category result() = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
std::unique_ptr<impl> impl_;
|
||||
};
|
||||
|
||||
class categorizer_manager {
|
||||
public:
|
||||
categorizer_manager(logger& lgr);
|
||||
|
||||
void add(std::shared_ptr<categorizer const> c) { impl_->add(std::move(c)); }
|
||||
|
||||
categorizer_job job(std::filesystem::path const& path) const {
|
||||
return impl_->job(path);
|
||||
}
|
||||
|
||||
std::string_view category_name(file_category c) const {
|
||||
return impl_->category_name(c);
|
||||
}
|
||||
|
||||
class impl {
|
||||
public:
|
||||
virtual ~impl() = default;
|
||||
|
||||
virtual void add(std::shared_ptr<categorizer const> c) = 0;
|
||||
virtual categorizer_job job(std::filesystem::path const& path) const = 0;
|
||||
virtual std::string_view category_name(file_category c) const = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
std::unique_ptr<impl> impl_;
|
||||
};
|
||||
|
||||
class categorizer_info {
|
||||
public:
|
||||
virtual ~categorizer_info() = default;
|
||||
|
||||
virtual std::string_view name() const = 0;
|
||||
virtual std::shared_ptr<boost::program_options::options_description const>
|
||||
options() const = 0;
|
||||
};
|
||||
|
||||
class categorizer_factory : public categorizer_info {
|
||||
public:
|
||||
virtual std::unique_ptr<categorizer>
|
||||
create(logger& lgr,
|
||||
boost::program_options::variables_map const& vm) const = 0;
|
||||
};
|
||||
|
||||
namespace detail {

/**
 * Helper whose constructor registers a factory of type `T` with the
 * categorizer registry. The constructor is defined further down, once
 * `categorizer_registry` is complete.
 */
template <typename T>
class categorizer_factory_registrar {
 public:
  categorizer_factory_registrar();
};

} // namespace detail
|
||||
|
||||
class categorizer_registry {
|
||||
public:
|
||||
template <typename T>
|
||||
friend class detail::categorizer_factory_registrar;
|
||||
|
||||
static categorizer_registry& instance();
|
||||
|
||||
std::unique_ptr<categorizer>
|
||||
create(logger& lgr, std::string const& name,
|
||||
boost::program_options::variables_map const& vm) const;
|
||||
|
||||
void add_options(boost::program_options::options_description& opts) const;
|
||||
|
||||
std::vector<std::string> categorizer_names() const;
|
||||
|
||||
private:
|
||||
categorizer_registry();
|
||||
~categorizer_registry();
|
||||
|
||||
void register_factory(std::unique_ptr<categorizer_factory const>&& factory);
|
||||
|
||||
std::map<std::string, std::unique_ptr<categorizer_factory const>> factories_;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
categorizer_factory_registrar<T>::categorizer_factory_registrar() {
|
||||
::dwarfs::categorizer_registry::instance().register_factory(
|
||||
std::make_unique<T>());
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/**
 * Define a file-local registrar object for `factory`. Constructing that
 * object registers an instance of `factory` with the categorizer registry.
 */
#define REGISTER_CATEGORIZER_FACTORY(factory)                                  \
  namespace {                                                                  \
  ::dwarfs::detail::categorizer_factory_registrar<factory>                     \
      the_##factory##_registrar;                                               \
  }
|
||||
|
||||
} // namespace dwarfs
|
72
include/dwarfs/file_category.h
Normal file
72
include/dwarfs/file_category.h
Normal file
@ -0,0 +1,72 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
#include <limits>
#include <stdexcept>
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
/**
 * Small value type identifying a file category by number.
 *
 * The largest representable value is reserved as the "uninitialized"
 * sentinel. A default-constructed object holds that sentinel and reports
 * `empty()`; reading its `value()` throws.
 */
class file_category {
 public:
  using value_type = uint32_t;

  /// Sentinel stored by objects that carry no category.
  static constexpr value_type const uninitialized{
      std::numeric_limits<value_type>::max()};
  /// Smallest valid category value.
  static constexpr value_type const min{0};
  /// Largest valid category value (one below the sentinel).
  static constexpr value_type const max{std::numeric_limits<value_type>::max() -
                                        1};

  /// Construct an empty (uninitialized) category.
  file_category() noexcept
      : value_{uninitialized} {}

  /// Construct from a raw value; intentionally implicit.
  file_category(value_type v) noexcept
      : value_{v} {}

  file_category(file_category const&) = default;
  file_category(file_category&&) = default;

  file_category& operator=(file_category const&) = default;
  file_category& operator=(file_category&&) = default;

  /// Assign a raw value.
  file_category& operator=(value_type v) noexcept {
    value_ = v;
    return *this;
  }

  /**
   * Access the stored value.
   *
   * \throws std::range_error if the category is uninitialized.
   */
  value_type value() const {
    if (empty()) {
      throw std::range_error("file_category is uninitialized");
    }
    return value_;
  }

  /// Reset to the uninitialized state.
  void clear() noexcept { value_ = uninitialized; }

  /// True if no category value has been set.
  bool empty() const noexcept { return value_ == uninitialized; }

  /// True if a category value has been set.
  explicit operator bool() const noexcept { return !empty(); }

 private:
  value_type value_;
};
|
||||
|
||||
} // namespace dwarfs
|
@ -27,6 +27,7 @@
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
|
||||
#include "dwarfs/file_category.h"
|
||||
#include "dwarfs/nilsimsa.h"
|
||||
#include "dwarfs/object.h"
|
||||
|
||||
@ -58,6 +59,7 @@ class inode : public object {
|
||||
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
|
||||
virtual void
|
||||
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
|
||||
virtual file_category category() const = 0;
|
||||
};
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -24,6 +24,11 @@
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string_view>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "dwarfs/file_category.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
@ -54,6 +59,10 @@ class inode_manager {
|
||||
impl_->for_each_inode_in_order(fn);
|
||||
}
|
||||
|
||||
std::vector<std::pair<file_category, size_t>> category_counts() const {
|
||||
return impl_->category_counts();
|
||||
}
|
||||
|
||||
class impl {
|
||||
public:
|
||||
virtual ~impl() = default;
|
||||
@ -65,6 +74,8 @@ class inode_manager {
|
||||
file_order_options const& file_order, order_cb const& fn) = 0;
|
||||
virtual void for_each_inode_in_order(
|
||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||
virtual std::vector<std::pair<file_category, size_t>>
|
||||
category_counts() const = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
|
||||
#include "dwarfs/file_stat.h"
|
||||
@ -32,6 +33,7 @@
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class categorizer_manager;
|
||||
class entry;
|
||||
|
||||
enum class mlock_mode { NONE, TRY, MUST };
|
||||
@ -78,11 +80,12 @@ struct inode_options {
|
||||
bool with_similarity{false};
|
||||
bool with_nilsimsa{false};
|
||||
std::optional<size_t> max_similarity_scan_size;
|
||||
std::shared_ptr<categorizer_manager> categorizer_mgr;
|
||||
|
||||
bool needs_scan(size_t size) const {
|
||||
return (with_similarity || with_nilsimsa) &&
|
||||
(!max_similarity_scan_size ||
|
||||
size <= max_similarity_scan_size.value());
|
||||
return categorizer_mgr || ((with_similarity || with_nilsimsa) &&
|
||||
(!max_similarity_scan_size ||
|
||||
size <= max_similarity_scan_size.value()));
|
||||
}
|
||||
};
|
||||
|
||||
|
268
src/dwarfs/categorizer.cpp
Normal file
268
src/dwarfs/categorizer.cpp
Normal file
@ -0,0 +1,268 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <folly/container/Enumerate.h>
|
||||
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/compiler.h"
|
||||
#include "dwarfs/error.h"
|
||||
#include "dwarfs/logger.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
namespace {

/// Name of the category used when no categorizer assigns one.
constexpr std::string_view const DEFAULT_CATEGORY{"<default>"};

} // namespace
|
||||
|
||||
class categorizer_manager_private {
|
||||
public:
|
||||
virtual ~categorizer_manager_private() = default;
|
||||
|
||||
virtual std::vector<std::shared_ptr<categorizer const>> const&
|
||||
categorizers() const = 0;
|
||||
virtual file_category category(std::string_view cat) const = 0;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class categorizer_job_ final : public categorizer_job::impl {
|
||||
public:
|
||||
categorizer_job_(logger& lgr, categorizer_manager_private const& mgr,
|
||||
std::filesystem::path const& path)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, mgr_{mgr}
|
||||
, path_{path} {}
|
||||
|
||||
void categorize_random_access(std::span<uint8_t const> data) override;
|
||||
void categorize_sequential(std::span<uint8_t const> data) override;
|
||||
file_category result() override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
categorizer_manager_private const& mgr_;
|
||||
|
||||
std::string_view best_{DEFAULT_CATEGORY};
|
||||
int index_{-1};
|
||||
bool is_global_best_{false};
|
||||
size_t total_size_hint_{0};
|
||||
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
|
||||
seq_jobs_;
|
||||
std::filesystem::path const path_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void categorizer_job_<LoggerPolicy>::categorize_random_access(
|
||||
std::span<uint8_t const> data) {
|
||||
DWARFS_CHECK(index_ < 0,
|
||||
"internal error: index already set in categorize_random_access");
|
||||
|
||||
total_size_hint_ = data.size();
|
||||
|
||||
bool global_best = true;
|
||||
|
||||
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
|
||||
if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) {
|
||||
if (auto c = p->categorize(path_, data)) {
|
||||
best_ = *c;
|
||||
index_ = index;
|
||||
is_global_best_ = global_best;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
global_best = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void categorizer_job_<LoggerPolicy>::categorize_sequential(
|
||||
std::span<uint8_t const> data) {
|
||||
if (is_global_best_) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (seq_jobs_.empty()) [[unlikely]] {
|
||||
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
|
||||
if (index_ >= 0 && static_cast<int>(index) >= index_) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) {
|
||||
if (auto job = p->job(path_, total_size_hint_)) {
|
||||
seq_jobs_.emplace_back(index, std::move(job));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto&& [index, job] : seq_jobs_) {
|
||||
job->add(data);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
file_category categorizer_job_<LoggerPolicy>::result() {
|
||||
if (!seq_jobs_.empty()) {
|
||||
for (auto&& [index, job] : seq_jobs_) {
|
||||
if (auto c = job->result()) {
|
||||
assert(index_ < 0 || index < index_);
|
||||
best_ = *c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
seq_jobs_.clear();
|
||||
}
|
||||
|
||||
LOG_TRACE << path_ << " -> " << best_;
|
||||
|
||||
return mgr_.category(best_);
|
||||
}
|
||||
|
||||
// An empty job; it owns no implementation.
categorizer_job::categorizer_job() = default;

// A job that takes ownership of the given implementation.
categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
    : impl_{std::move(impl)} {}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class categorizer_manager_ final : public categorizer_manager::impl,
|
||||
public categorizer_manager_private {
|
||||
public:
|
||||
categorizer_manager_(logger& lgr)
|
||||
: lgr_{lgr}
|
||||
, LOG_PROXY_INIT(lgr) {
|
||||
add_category(DEFAULT_CATEGORY);
|
||||
}
|
||||
|
||||
void add(std::shared_ptr<categorizer const> c) override;
|
||||
categorizer_job job(std::filesystem::path const& path) const override;
|
||||
std::string_view category_name(file_category c) const override;
|
||||
|
||||
std::vector<std::shared_ptr<categorizer const>> const&
|
||||
categorizers() const override {
|
||||
return categorizers_;
|
||||
}
|
||||
|
||||
file_category category(std::string_view cat) const override {
|
||||
auto it = catmap_.find(cat);
|
||||
DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat));
|
||||
return it->second;
|
||||
}
|
||||
|
||||
private:
|
||||
void add_category(std::string_view cat) {
|
||||
if (catmap_.emplace(cat, categories_.size()).second) {
|
||||
categories_.emplace_back(cat);
|
||||
} else {
|
||||
LOG_WARN << "duplicate category: " << cat;
|
||||
}
|
||||
}
|
||||
|
||||
logger& lgr_;
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
std::vector<std::shared_ptr<categorizer const>> categorizers_;
|
||||
std::vector<std::string_view> categories_;
|
||||
std::unordered_map<std::string_view, file_category> catmap_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void categorizer_manager_<LoggerPolicy>::add(
|
||||
std::shared_ptr<categorizer const> c) {
|
||||
for (auto const& c : c->categories()) {
|
||||
add_category(c);
|
||||
}
|
||||
|
||||
categorizers_.emplace_back(std::move(c));
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
categorizer_job categorizer_manager_<LoggerPolicy>::job(
|
||||
std::filesystem::path const& path) const {
|
||||
return categorizer_job(
|
||||
make_unique_logging_object<categorizer_job::impl, categorizer_job_,
|
||||
logger_policies>(lgr_, *this, path));
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::string_view
|
||||
categorizer_manager_<LoggerPolicy>::category_name(file_category c) const {
|
||||
return DWARFS_NOTHROW(categories_.at(c.value()));
|
||||
}
|
||||
|
||||
categorizer_manager::categorizer_manager(logger& lgr)
|
||||
: impl_(make_unique_logging_object<impl, categorizer_manager_,
|
||||
logger_policies>(lgr)) {}
|
||||
|
||||
// Returns the registry singleton, a function-local static created on first
// use.
categorizer_registry& categorizer_registry::instance() {
  static categorizer_registry registry;
  return registry;
}
|
||||
|
||||
void categorizer_registry::register_factory(
|
||||
std::unique_ptr<categorizer_factory const>&& factory) {
|
||||
auto name = factory->name();
|
||||
|
||||
if (!factories_.emplace(name, std::move(factory)).second) {
|
||||
std::cerr << "categorizer factory name conflict (" << name << "\n";
|
||||
::abort();
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<categorizer>
|
||||
categorizer_registry::create(logger& lgr, std::string const& name,
|
||||
po::variables_map const& vm) const {
|
||||
auto it = factories_.find(name);
|
||||
|
||||
if (it == factories_.end()) {
|
||||
DWARFS_THROW(runtime_error, "unknown categorizer: " + name);
|
||||
}
|
||||
|
||||
return it->second->create(lgr, vm);
|
||||
}
|
||||
|
||||
/**
 * Add the options of every registered factory to `opts`. Factories that
 * return no options description are skipped.
 */
void categorizer_registry::add_options(po::options_description& opts) const {
  for (auto const& entry : factories_) {
    auto const factory_opts = entry.second->options();

    if (!factory_opts) {
      continue;
    }

    opts.add(*factory_opts);
  }
}
|
||||
|
||||
std::vector<std::string> categorizer_registry::categorizer_names() const {
|
||||
std::vector<std::string> rv;
|
||||
for (auto& f : factories_) {
|
||||
rv.emplace_back(f.first);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
// Defined out of line; both are private in the class declaration.
categorizer_registry::categorizer_registry() = default;

categorizer_registry::~categorizer_registry() = default;
|
||||
|
||||
} // namespace dwarfs
|
129
src/dwarfs/categorizer/binary_categorizer.cpp
Normal file
129
src/dwarfs/categorizer/binary_categorizer.cpp
Normal file
@ -0,0 +1,129 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <stack>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
// TODO: this should obviously work on every platform, not only where <elf.h> is available
|
||||
#ifndef _WIN32
|
||||
#include <elf.h>
|
||||
#endif
|
||||
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/error.h"
|
||||
#include "dwarfs/logger.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
namespace {
|
||||
|
||||
/// Only category announced by the binary categorizer.
/// NOTE(review): "bla" looks like a placeholder name - confirm.
constexpr std::string_view const SOME_CATEGORY{"bla"};
|
||||
|
||||
class binary_categorizer_base : public random_access_categorizer {
|
||||
public:
|
||||
std::span<std::string_view const> categories() const override;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class binary_categorizer_ final : public binary_categorizer_base {
|
||||
public:
|
||||
binary_categorizer_(logger& lgr)
|
||||
: LOG_PROXY_INIT(lgr) {}
|
||||
|
||||
std::optional<std::string_view>
|
||||
categorize(std::filesystem::path const& path,
|
||||
std::span<uint8_t const> data) const override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
};
|
||||
|
||||
/// Returns a view of the static list of categories of this categorizer.
std::span<std::string_view const> binary_categorizer_base::categories() const {
  static constexpr std::array const known_categories{
      SOME_CATEGORY,
  };
  return known_categories;
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::optional<std::string_view>
|
||||
binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
|
||||
std::span<uint8_t const> data
|
||||
[[maybe_unused]]) const {
|
||||
#ifndef _WIN32
|
||||
auto p = data.data();
|
||||
if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) {
|
||||
switch (p[EI_OSABI]) {
|
||||
case ELFOSABI_SYSV: // 0 /* UNIX System V ABI */
|
||||
case ELFOSABI_HPUX: // 1 /* HP-UX */
|
||||
case ELFOSABI_NETBSD: // 2 /* NetBSD. */
|
||||
case ELFOSABI_GNU: // 3 /* Object uses GNU ELF extensions. */
|
||||
case ELFOSABI_SOLARIS: // 6 /* Sun Solaris. */
|
||||
case ELFOSABI_AIX: // 7 /* IBM AIX. */
|
||||
case ELFOSABI_IRIX: // 8 /* SGI Irix. */
|
||||
case ELFOSABI_FREEBSD: // 9 /* FreeBSD. */
|
||||
case ELFOSABI_TRU64: // 10 /* Compaq TRU64 UNIX. */
|
||||
case ELFOSABI_MODESTO: // 11 /* Novell Modesto. */
|
||||
case ELFOSABI_OPENBSD: // 12 /* OpenBSD. */
|
||||
case ELFOSABI_ARM_AEABI: // 64 /* ARM EABI */
|
||||
case ELFOSABI_ARM: // 97 /* ARM */
|
||||
case ELFOSABI_STANDALONE: // 255 /* Standalone (embedded) application */
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
class binary_categorizer_factory : public categorizer_factory {
|
||||
public:
|
||||
std::string_view name() const override { return "binary"; }
|
||||
|
||||
std::shared_ptr<boost::program_options::options_description const>
|
||||
options() const override {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::unique_ptr<categorizer>
|
||||
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||
return make_unique_logging_object<categorizer, binary_categorizer_,
|
||||
logger_policies>(lgr);
|
||||
}
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
REGISTER_CATEGORIZER_FACTORY(binary_categorizer_factory)
|
||||
|
||||
} // namespace dwarfs
|
232
src/dwarfs/categorizer/incompressible_categorizer.cpp
Normal file
232
src/dwarfs/categorizer/incompressible_categorizer.cpp
Normal file
@ -0,0 +1,232 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <lz4.h>
|
||||
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/error.h"
|
||||
#include "dwarfs/logger.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
namespace {
|
||||
|
||||
/// Name of the category reported for data that does not compress well.
constexpr std::string_view const INCOMPRESSIBLE_CATEGORY{"incompressible"};
|
||||
|
||||
/**
 * Tunables of the incompressible categorizer.
 */
struct incompressible_categorizer_config {
  // NOTE(review): presumably the minimum input size for categorization; its
  // use is not visible here - confirm.
  size_t min_input_size;
  // Compressed/uncompressed size ratio at or above which data counts as
  // incompressible.
  double max_ratio_size;
  // Fraction of incompressible blocks at or above which the whole input
  // counts as incompressible.
  double max_ratio_blocks;
  // Acceleration parameter passed to LZ4_compress_fast_extState().
  int lz4_acceleration;
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class incompressible_categorizer_job_ : public sequential_categorizer_job {
|
||||
public:
|
||||
static constexpr size_t const block_size{1024 * 1024};
|
||||
|
||||
incompressible_categorizer_job_(logger& lgr,
|
||||
incompressible_categorizer_config const& cfg,
|
||||
std::filesystem::path const& path,
|
||||
size_t total_size)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, cfg_{cfg}
|
||||
, path_{path} {
|
||||
input_.reserve(total_size < block_size ? total_size : block_size);
|
||||
state_ = ::malloc(LZ4_sizeofState());
|
||||
}
|
||||
|
||||
~incompressible_categorizer_job_() { ::free(state_); }
|
||||
|
||||
void add(std::span<uint8_t const> data) override {
|
||||
while (!data.empty()) {
|
||||
auto part_size = input_.size() + data.size() <= block_size
|
||||
? data.size()
|
||||
: block_size - input_.size();
|
||||
add_input(data.first(part_size));
|
||||
data = data.subspan(part_size);
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::string_view> result() override {
|
||||
if (!input_.empty()) {
|
||||
compress();
|
||||
}
|
||||
LOG_TRACE << path_ << " -> blocks: " << incompressible_blocks_ << "/"
|
||||
<< total_blocks_ << ", total compression ratio: "
|
||||
<< fmt::format("{:.2f}%",
|
||||
100.0 * total_output_size_ / total_input_size_);
|
||||
if (total_blocks_ > 0 &&
|
||||
(total_output_size_ >= cfg_.max_ratio_size * total_input_size_ ||
|
||||
incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) {
|
||||
return INCOMPRESSIBLE_CATEGORY;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
private:
|
||||
void add_input(std::span<uint8_t const> data) {
|
||||
auto current_size = input_.size();
|
||||
assert(current_size + data.size() <= block_size);
|
||||
input_.resize(current_size + data.size());
|
||||
::memcpy(&input_[current_size], data.data(), data.size());
|
||||
if (input_.size() == block_size) {
|
||||
compress();
|
||||
}
|
||||
}
|
||||
|
||||
void compress() {
|
||||
total_input_size_ += input_.size();
|
||||
|
||||
output_.resize(::LZ4_compressBound(input_.size()));
|
||||
|
||||
auto rv = ::LZ4_compress_fast_extState(
|
||||
state_, reinterpret_cast<char*>(input_.data()),
|
||||
reinterpret_cast<char*>(output_.data()), input_.size(), output_.size(),
|
||||
cfg_.lz4_acceleration);
|
||||
|
||||
if (rv == 0) {
|
||||
DWARFS_THROW(runtime_error,
|
||||
"unexpected error in LZ4_compress_fast_extState");
|
||||
}
|
||||
|
||||
total_output_size_ += rv;
|
||||
++total_blocks_;
|
||||
|
||||
if (rv >= static_cast<int>(cfg_.max_ratio_size * input_.size())) {
|
||||
++incompressible_blocks_;
|
||||
}
|
||||
|
||||
input_.clear();
|
||||
}
|
||||
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
void* state_;
|
||||
std::vector<uint8_t> input_;
|
||||
std::vector<uint8_t> output_;
|
||||
size_t total_input_size_{0};
|
||||
size_t total_output_size_{0};
|
||||
size_t total_blocks_{0};
|
||||
size_t incompressible_blocks_{0};
|
||||
incompressible_categorizer_config const& cfg_;
|
||||
std::filesystem::path const& path_;
|
||||
};
|
||||
|
||||
class incompressible_categorizer_ final : public sequential_categorizer {
|
||||
public:
|
||||
incompressible_categorizer_(logger& lgr,
|
||||
incompressible_categorizer_config const& cfg);
|
||||
|
||||
std::span<std::string_view const> categories() const override;
|
||||
std::unique_ptr<sequential_categorizer_job>
|
||||
job(std::filesystem::path const& path, size_t total_size) const override;
|
||||
|
||||
private:
|
||||
logger& lgr_;
|
||||
incompressible_categorizer_config const config_;
|
||||
};
|
||||
|
||||
incompressible_categorizer_::incompressible_categorizer_(
|
||||
logger& lgr, incompressible_categorizer_config const& cfg)
|
||||
: lgr_{lgr}
|
||||
, config_{cfg} {}
|
||||
|
||||
std::span<std::string_view const>
|
||||
incompressible_categorizer_::categories() const {
|
||||
static constexpr std::array const s_categories{
|
||||
INCOMPRESSIBLE_CATEGORY,
|
||||
};
|
||||
return s_categories;
|
||||
}
|
||||
|
||||
std::unique_ptr<sequential_categorizer_job>
|
||||
incompressible_categorizer_::job(std::filesystem::path const& path,
|
||||
size_t total_size) const {
|
||||
if (total_size < config_.min_input_size) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return make_unique_logging_object<sequential_categorizer_job,
|
||||
incompressible_categorizer_job_,
|
||||
logger_policies>(lgr_, config_, path,
|
||||
total_size);
|
||||
}
|
||||
|
||||
class incompressible_categorizer_factory : public categorizer_factory {
|
||||
public:
|
||||
incompressible_categorizer_factory()
|
||||
: opts_{std::make_shared<po::options_description>(
|
||||
"Incompressible categorizer options")} {
|
||||
static constexpr double const default_ratio{0.99};
|
||||
auto const default_ratio_str{fmt::format("{:.2f}", default_ratio)};
|
||||
// clang-format off
|
||||
opts_->add_options()
|
||||
("incompressible-min-input-size",
|
||||
po::value<size_t>(&cfg_.min_input_size)->default_value(256),
|
||||
"minimum file size in bytes to check for incompressibility")
|
||||
("incompressible-max-size-ratio",
|
||||
po::value<double>(&cfg_.max_ratio_size)
|
||||
->default_value(default_ratio, default_ratio_str),
|
||||
"LZ4 compression ratio above files are considered incompressible")
|
||||
("incompressible-max-blocks-ratio",
|
||||
po::value<double>(&cfg_.max_ratio_blocks)
|
||||
->default_value(default_ratio, default_ratio_str),
|
||||
"ratio of incompressible LZ4 blocks above which the whole file"
|
||||
" is considered incompressible")
|
||||
("incompressible-lz4-acceleration (1..65537)",
|
||||
po::value<int>(&cfg_.lz4_acceleration)->default_value(1),
|
||||
"LZ4 acceleration value")
|
||||
;
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
std::string_view name() const override { return "incompressible"; }
|
||||
|
||||
std::shared_ptr<po::options_description const> options() const override {
|
||||
return opts_;
|
||||
}
|
||||
|
||||
std::unique_ptr<categorizer>
|
||||
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||
return std::make_unique<incompressible_categorizer_>(lgr, cfg_);
|
||||
}
|
||||
|
||||
private:
|
||||
incompressible_categorizer_config cfg_;
|
||||
std::shared_ptr<po::options_description> opts_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
REGISTER_CATEGORIZER_FACTORY(incompressible_categorizer_factory)
|
||||
|
||||
} // namespace dwarfs
|
198
src/dwarfs/categorizer/libmagic_categorizer.cpp
Normal file
198
src/dwarfs/categorizer/libmagic_categorizer.cpp
Normal file
@ -0,0 +1,198 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <array>
|
||||
#include <map>
|
||||
#include <stack>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <folly/Synchronized.h>
|
||||
|
||||
#include <magic.h>
|
||||
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/error.h"
|
||||
#include "dwarfs/logger.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
namespace {
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
// Placeholder category name reported by categories().
constexpr std::string_view const SOME_CATEGORY{"bla"};

// MIME types that denote executable code. Declared const: this is a fixed
// lookup table and must not be modified at runtime.
std::unordered_set<std::string_view> const executable_mime_types{
    "application/x-executable",
    "application/x-sharedlib",
};
|
||||
|
||||
class magic_wrapper {
|
||||
public:
|
||||
magic_wrapper() = default;
|
||||
|
||||
size_t cookie_count() const {
|
||||
auto rlock = cookies_.rlock();
|
||||
return rlock->size();
|
||||
}
|
||||
|
||||
std::string identify(std::span<uint8_t const> data) const {
|
||||
std::string rv;
|
||||
scoped_cookie m(*this);
|
||||
if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) {
|
||||
rv.assign(id);
|
||||
}
|
||||
if (rv.starts_with("application/")) {
|
||||
::magic_setflags(m.get(), MAGIC_NONE);
|
||||
if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) {
|
||||
rv += "; " + std::string(id);
|
||||
}
|
||||
::magic_setflags(m.get(), MAGIC_MIME_TYPE);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
private:
|
||||
using magic_cookie_t =
|
||||
std::unique_ptr<struct ::magic_set, decltype(&::magic_close)>;
|
||||
|
||||
magic_cookie_t new_cookie() const {
|
||||
magic_cookie_t m(::magic_open(MAGIC_MIME_TYPE), &::magic_close);
|
||||
if (!m) {
|
||||
throw std::runtime_error("could not create magic cookie");
|
||||
}
|
||||
if (::magic_load(m.get(), NULL) != 0) {
|
||||
throw std::runtime_error(
|
||||
fmt::format("(magic) {}", ::magic_error(m.get())));
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
class scoped_cookie {
|
||||
public:
|
||||
scoped_cookie(magic_wrapper const& w)
|
||||
: cookie_{get_scoped_cookie(w)}
|
||||
, w_{w} {}
|
||||
|
||||
~scoped_cookie() {
|
||||
auto wlock = w_.cookies_.wlock();
|
||||
wlock->push(std::move(cookie_));
|
||||
}
|
||||
|
||||
::magic_t get() const { return cookie_.get(); }
|
||||
|
||||
private:
|
||||
static magic_cookie_t get_scoped_cookie(magic_wrapper const& w) {
|
||||
auto wlock = w.cookies_.wlock();
|
||||
if (wlock->empty()) [[unlikely]] {
|
||||
return w.new_cookie();
|
||||
}
|
||||
auto cookie = std::move(wlock->top());
|
||||
wlock->pop();
|
||||
return cookie;
|
||||
}
|
||||
|
||||
magic_cookie_t cookie_;
|
||||
magic_wrapper const& w_;
|
||||
};
|
||||
|
||||
mutable folly::Synchronized<std::stack<magic_cookie_t>> cookies_;
|
||||
};
|
||||
|
||||
class libmagic_categorizer_base : public random_access_categorizer {
|
||||
public:
|
||||
std::span<std::string_view const> categories() const override;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class libmagic_categorizer_ final : public libmagic_categorizer_base {
|
||||
public:
|
||||
explicit libmagic_categorizer_(logger& lgr)
|
||||
: LOG_PROXY_INIT(lgr) {}
|
||||
|
||||
~libmagic_categorizer_() {
|
||||
LOG_INFO << m_.cookie_count() << " magic cookies were used";
|
||||
{
|
||||
auto rlock = mimetypes_.rlock();
|
||||
for (auto const& [k, v] : *rlock) {
|
||||
LOG_INFO << k << " -> " << v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::string_view>
|
||||
categorize(std::filesystem::path const& path,
|
||||
std::span<uint8_t const> data) const override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
magic_wrapper m_;
|
||||
mutable folly::Synchronized<std::map<std::string, size_t>> mimetypes_;
|
||||
};
|
||||
|
||||
std::span<std::string_view const>
|
||||
libmagic_categorizer_base::categories() const {
|
||||
static constexpr std::array const s_categories{
|
||||
SOME_CATEGORY,
|
||||
};
|
||||
return s_categories;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::optional<std::string_view> libmagic_categorizer_<LoggerPolicy>::categorize(
|
||||
std::filesystem::path const& path, std::span<uint8_t const> data) const {
|
||||
auto id = m_.identify(data);
|
||||
LOG_DEBUG << path << " -> (magic) " << id;
|
||||
{
|
||||
auto wlock = mimetypes_.wlock();
|
||||
++(*wlock)[id];
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
class libmagic_categorizer_factory : public categorizer_factory {
|
||||
public:
|
||||
std::string_view name() const override { return "libmagic"; }
|
||||
|
||||
std::shared_ptr<boost::program_options::options_description const>
|
||||
options() const override {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::unique_ptr<categorizer>
|
||||
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||
return make_unique_logging_object<categorizer, libmagic_categorizer_,
|
||||
logger_policies>(lgr);
|
||||
}
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
REGISTER_CATEGORIZER_FACTORY(libmagic_categorizer_factory)
|
||||
|
||||
} // namespace dwarfs
|
@ -29,10 +29,12 @@
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/compiler.h"
|
||||
#include "dwarfs/entry.h"
|
||||
#include "dwarfs/error.h"
|
||||
@ -144,8 +146,19 @@ class inode_ : public inode {
|
||||
similarity sc;
|
||||
nilsimsa nc;
|
||||
|
||||
categorizer_job catjob;
|
||||
|
||||
if (opts.categorizer_mgr) {
|
||||
catjob =
|
||||
opts.categorizer_mgr->job(mm ? mm->path().string() : "<no-file>");
|
||||
}
|
||||
|
||||
if (mm) {
|
||||
auto update_hashes = [&](uint8_t const* data, size_t size) {
|
||||
if (catjob) {
|
||||
catjob.categorize_random_access(mm->span());
|
||||
}
|
||||
|
||||
auto scan_sequential = [&](uint8_t const* data, size_t size) {
|
||||
if (opts.with_similarity) {
|
||||
sc.update(data, size);
|
||||
}
|
||||
@ -153,6 +166,10 @@ class inode_ : public inode {
|
||||
if (opts.with_nilsimsa) {
|
||||
nc.update(data, size);
|
||||
}
|
||||
|
||||
if (catjob) {
|
||||
catjob.categorize_sequential(std::span(data, size));
|
||||
}
|
||||
};
|
||||
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
@ -160,13 +177,13 @@ class inode_ : public inode {
|
||||
size_t size = mm->size();
|
||||
|
||||
while (size >= chunk_size) {
|
||||
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
||||
scan_sequential(mm->as<uint8_t>(offset), chunk_size);
|
||||
mm->release_until(offset);
|
||||
offset += chunk_size;
|
||||
size -= chunk_size;
|
||||
}
|
||||
|
||||
update_hashes(mm->as<uint8_t>(offset), size);
|
||||
scan_sequential(mm->as<uint8_t>(offset), size);
|
||||
}
|
||||
|
||||
if (opts.with_similarity) {
|
||||
@ -182,6 +199,10 @@ class inode_ : public inode {
|
||||
nilsimsa_valid_ = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (catjob) {
|
||||
category_ = catjob.result();
|
||||
}
|
||||
}
|
||||
|
||||
void add_chunk(size_t block, size_t offset, size_t size) override {
|
||||
@ -207,9 +228,12 @@ class inode_ : public inode {
|
||||
vec.insert(vec.end(), chunks_.begin(), chunks_.end());
|
||||
}
|
||||
|
||||
// Returns the stored category_ of this inode.
file_category category() const override { return category_; }
|
||||
|
||||
private:
|
||||
std::optional<uint32_t> num_;
|
||||
uint32_t similarity_hash_{0};
|
||||
file_category category_;
|
||||
files_vector files_;
|
||||
std::vector<chunk_type> chunks_;
|
||||
nilsimsa::hash_type nilsimsa_similarity_hash_;
|
||||
@ -254,6 +278,27 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<file_category, size_t>>
|
||||
category_counts() const override {
|
||||
std::unordered_map<file_category::value_type, size_t> tmp;
|
||||
|
||||
for (auto const& i : inodes_) {
|
||||
++tmp[i->category().value()];
|
||||
}
|
||||
|
||||
std::vector<std::pair<file_category, size_t>> rv;
|
||||
|
||||
for (auto const& [k, v] : tmp) {
|
||||
rv.emplace_back(k, v);
|
||||
}
|
||||
|
||||
std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) {
|
||||
return a.first.value() < b.first.value();
|
||||
});
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
private:
|
||||
void order_inodes_by_path() {
|
||||
std::vector<std::string> paths;
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include "dwarfs/block_data.h"
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/entry.h"
|
||||
#include "dwarfs/error.h"
|
||||
#include "dwarfs/file_scanner.h"
|
||||
@ -606,6 +607,14 @@ void scanner_<LoggerPolicy>::scan(
|
||||
<< prog.duplicate_files << "/" << prog.files_found
|
||||
<< " duplicate files";
|
||||
|
||||
if (options_.inode.categorizer_mgr) {
|
||||
for (auto const& cc : im.category_counts()) {
|
||||
LOG_INFO << cc.second << " "
|
||||
<< options_.inode.categorizer_mgr->category_name(cc.first)
|
||||
<< " files";
|
||||
}
|
||||
}
|
||||
|
||||
global_entry_data ge_data(options_);
|
||||
thrift::metadata::metadata mv2;
|
||||
|
||||
|
@ -53,6 +53,7 @@
|
||||
#include "dwarfs/block_compressor.h"
|
||||
#include "dwarfs/block_manager.h"
|
||||
#include "dwarfs/builtin_script.h"
|
||||
#include "dwarfs/categorizer.h"
|
||||
#include "dwarfs/chmod_transformer.h"
|
||||
#include "dwarfs/console_writer.h"
|
||||
#include "dwarfs/entry.h"
|
||||
@ -318,7 +319,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
||||
std::vector<sys_string> filter;
|
||||
size_t num_workers, num_scanner_workers;
|
||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||
force_overwrite = false;
|
||||
force_overwrite = false, enable_categorizer = false;
|
||||
unsigned level;
|
||||
int compress_niceness;
|
||||
uint16_t uid, gid;
|
||||
@ -391,6 +392,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
||||
("recompress",
|
||||
po::value<std::string>(&recompress_opts)->implicit_value("all"),
|
||||
"recompress an existing filesystem (none, block, metadata, all)")
|
||||
("categorize",
|
||||
po::value<bool>(&enable_categorizer)->zero_tokens(),
|
||||
"WIP enable categorizer")
|
||||
("order",
|
||||
po::value<std::string>(&order),
|
||||
order_desc.c_str())
|
||||
@ -510,6 +514,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
||||
.add(filesystem_opts)
|
||||
.add(metadata_opts);
|
||||
|
||||
auto& catreg = categorizer_registry::instance();
|
||||
catreg.add_options(opts);
|
||||
|
||||
po::variables_map vm;
|
||||
|
||||
auto& sys_err_out = SYS_CERR;
|
||||
@ -1021,6 +1028,14 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
||||
options.file_order.mode == file_order_mode::SIMILARITY;
|
||||
options.inode.with_nilsimsa =
|
||||
options.file_order.mode == file_order_mode::NILSIMSA;
|
||||
if (enable_categorizer) {
|
||||
options.inode.categorizer_mgr =
|
||||
std::make_shared<categorizer_manager>(lgr);
|
||||
// TODO
|
||||
for (auto const& name : catreg.categorizer_names()) {
|
||||
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
|
||||
}
|
||||
}
|
||||
|
||||
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
||||
std::make_shared<os_access_generic>(), std::move(script),
|
||||
@ -1031,6 +1046,8 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
||||
} else {
|
||||
s.scan(fsw, path, prog);
|
||||
}
|
||||
|
||||
options.inode.categorizer_mgr.reset();
|
||||
}
|
||||
} catch (runtime_error const& e) {
|
||||
LOG_ERROR << e.what();
|
||||
|
@ -17,6 +17,7 @@
|
||||
"glog",
|
||||
"libarchive",
|
||||
"libevent",
|
||||
"libmagic",
|
||||
"openssl",
|
||||
"pkgconf",
|
||||
"utfcpp",
|
||||
|
Loading…
x
Reference in New Issue
Block a user