mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-14 06:48:39 -04:00
Initial categorizer implementation
This commit is contained in:
parent
4e0d2ba25e
commit
1ac36bb6fa
@ -212,6 +212,7 @@ if(PKG_CONFIG_FOUND)
|
|||||||
pkg_check_modules(LIBBROTLIDEC IMPORTED_TARGET libbrotlidec>=1.0.9)
|
pkg_check_modules(LIBBROTLIDEC IMPORTED_TARGET libbrotlidec>=1.0.9)
|
||||||
pkg_check_modules(LIBBROTLIENC IMPORTED_TARGET libbrotlienc>=1.0.9)
|
pkg_check_modules(LIBBROTLIENC IMPORTED_TARGET libbrotlienc>=1.0.9)
|
||||||
pkg_check_modules(LIBARCHIVE IMPORTED_TARGET libarchive>=3.6.0)
|
pkg_check_modules(LIBARCHIVE IMPORTED_TARGET libarchive>=3.6.0)
|
||||||
|
pkg_check_modules(LIBMAGIC IMPORTED_TARGET libmagic>=5.38)
|
||||||
pkg_check_modules(ZSTD IMPORTED_TARGET libzstd>=1.5.2)
|
pkg_check_modules(ZSTD IMPORTED_TARGET libzstd>=1.5.2)
|
||||||
pkg_check_modules(XXHASH IMPORTED_TARGET libxxhash>=0.8.1)
|
pkg_check_modules(XXHASH IMPORTED_TARGET libxxhash>=0.8.1)
|
||||||
endif()
|
endif()
|
||||||
@ -356,6 +357,7 @@ list(
|
|||||||
src/dwarfs/block_range.cpp
|
src/dwarfs/block_range.cpp
|
||||||
src/dwarfs/builtin_script.cpp
|
src/dwarfs/builtin_script.cpp
|
||||||
src/dwarfs/cached_block.cpp
|
src/dwarfs/cached_block.cpp
|
||||||
|
src/dwarfs/categorizer.cpp
|
||||||
src/dwarfs/checksum.cpp
|
src/dwarfs/checksum.cpp
|
||||||
src/dwarfs/chmod_transformer.cpp
|
src/dwarfs/chmod_transformer.cpp
|
||||||
src/dwarfs/console_writer.cpp
|
src/dwarfs/console_writer.cpp
|
||||||
@ -413,8 +415,20 @@ if(LIBBROTLIDEC_FOUND AND LIBBROTLIENC_FOUND)
|
|||||||
list(APPEND LIBDWARFS_COMPRESSION_SRC src/dwarfs/compression/brotli.cpp)
|
list(APPEND LIBDWARFS_COMPRESSION_SRC src/dwarfs/compression/brotli.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Categorizer sources that are always compiled into dwarfs_categorizer.
list(APPEND LIBDWARFS_CATEGORIZER_SRC
     src/dwarfs/categorizer/binary_categorizer.cpp
     src/dwarfs/categorizer/incompressible_categorizer.cpp)
|
||||||
|
|
||||||
|
# The libmagic-backed categorizer is only built when pkg-config located
# libmagic.
if(LIBMAGIC_FOUND)
  list(
    APPEND
    LIBDWARFS_CATEGORIZER_SRC
    src/dwarfs/categorizer/libmagic_categorizer.cpp)
endif()
|
||||||
|
|
||||||
add_library(dwarfs ${LIBDWARFS_SRC})
|
add_library(dwarfs ${LIBDWARFS_SRC})
|
||||||
add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC})
|
add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC})
|
||||||
|
add_library(dwarfs_categorizer ${LIBDWARFS_CATEGORIZER_SRC})
|
||||||
add_library(dwarfs_tool src/dwarfs/tool.cpp)
|
add_library(dwarfs_tool src/dwarfs/tool.cpp)
|
||||||
|
|
||||||
if(DWARFS_GIT_BUILD)
|
if(DWARFS_GIT_BUILD)
|
||||||
@ -425,6 +439,7 @@ target_compile_definitions(
|
|||||||
dwarfs_tool PRIVATE PRJ_BUILD_ID="${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_SYSTEM}, ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}"
|
dwarfs_tool PRIVATE PRJ_BUILD_ID="${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_SYSTEM}, ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
target_link_libraries(dwarfs_categorizer folly)
|
||||||
target_link_libraries(dwarfs_compression folly)
|
target_link_libraries(dwarfs_compression folly)
|
||||||
target_link_libraries(dwarfs_tool dwarfs)
|
target_link_libraries(dwarfs_tool dwarfs)
|
||||||
|
|
||||||
@ -719,7 +734,8 @@ target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS})
|
|||||||
|
|
||||||
target_link_libraries(metadata_thrift thrift_light)
|
target_link_libraries(metadata_thrift thrift_light)
|
||||||
|
|
||||||
foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
|
foreach(tgt dwarfs dwarfs_compression dwarfs_categorizer
|
||||||
|
dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
|
||||||
target_include_directories(
|
target_include_directories(
|
||||||
${tgt} SYSTEM
|
${tgt} SYSTEM
|
||||||
PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS}
|
PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS}
|
||||||
@ -732,6 +748,7 @@ foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGE
|
|||||||
PRIVATE DWARFS_HAVE_LIBZSTD
|
PRIVATE DWARFS_HAVE_LIBZSTD
|
||||||
DWARFS_STATIC_BUILD=${STATIC_BUILD_DO_NOT_USE}
|
DWARFS_STATIC_BUILD=${STATIC_BUILD_DO_NOT_USE}
|
||||||
$<$<BOOL:${USE_JEMALLOC}>:DWARFS_USE_JEMALLOC>
|
$<$<BOOL:${USE_JEMALLOC}>:DWARFS_USE_JEMALLOC>
|
||||||
|
$<$<BOOL:${LIBMAGIC_FOUND}>:DWARFS_HAVE_LIBMAGIC>
|
||||||
$<$<BOOL:${LIBLZ4_FOUND}>:DWARFS_HAVE_LIBLZ4>
|
$<$<BOOL:${LIBLZ4_FOUND}>:DWARFS_HAVE_LIBLZ4>
|
||||||
$<$<BOOL:${LIBLZMA_FOUND}>:DWARFS_HAVE_LIBLZMA>
|
$<$<BOOL:${LIBLZMA_FOUND}>:DWARFS_HAVE_LIBLZMA>
|
||||||
$<$<AND:$<BOOL:${LIBBROTLIDEC_FOUND}>,$<BOOL:${LIBBROTLIENC_FOUND}>>:DWARFS_HAVE_LIBBROTLI>
|
$<$<AND:$<BOOL:${LIBBROTLIDEC_FOUND}>,$<BOOL:${LIBBROTLIENC_FOUND}>>:DWARFS_HAVE_LIBBROTLI>
|
||||||
@ -809,6 +826,10 @@ target_link_libraries(
|
|||||||
fsst
|
fsst
|
||||||
${Boost_LIBRARIES})
|
${Boost_LIBRARIES})
|
||||||
|
|
||||||
|
# Link libmagic into the core library only when it is available.
if(LIBMAGIC_FOUND)
  target_link_libraries(
    dwarfs
    PkgConfig::LIBMAGIC)
endif()
|
||||||
|
|
||||||
if(LIBLZ4_FOUND)
|
if(LIBLZ4_FOUND)
|
||||||
target_link_libraries(dwarfs PkgConfig::LIBLZ4)
|
target_link_libraries(dwarfs PkgConfig::LIBLZ4)
|
||||||
endif()
|
endif()
|
||||||
@ -823,6 +844,7 @@ endif()
|
|||||||
|
|
||||||
if(NOT STATIC_BUILD_DO_NOT_USE)
|
if(NOT STATIC_BUILD_DO_NOT_USE)
|
||||||
target_link_libraries(dwarfs PkgConfig::LIBARCHIVE)
|
target_link_libraries(dwarfs PkgConfig::LIBARCHIVE)
|
||||||
|
# libmagic is optional: pkg_check_modules() is called without REQUIRED and the
# libmagic categorizer source is only added when LIBMAGIC_FOUND is set. The
# imported target PkgConfig::LIBMAGIC exists only in that case, so the link
# must be guarded the same way or configuration fails without libmagic.
if(LIBMAGIC_FOUND)
  target_link_libraries(dwarfs_categorizer PkgConfig::LIBMAGIC)
endif()
|
||||||
endif(NOT STATIC_BUILD_DO_NOT_USE)
|
endif(NOT STATIC_BUILD_DO_NOT_USE)
|
||||||
|
|
||||||
if(ZSTD_FOUND AND PREFER_SYSTEM_ZSTD)
|
if(ZSTD_FOUND AND PREFER_SYSTEM_ZSTD)
|
||||||
@ -850,6 +872,7 @@ foreach(tgt ${BINARY_TARGETS} ${MAIN_TARGETS})
|
|||||||
endif()
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
|
# Link dwarfs_categorizer as a whole archive so that all of its object files
# end up in mkdwarfs_main, even those nothing references by symbol.
# NOTE(review): presumably required because categorizer factories register
# themselves through file-local objects (REGISTER_CATEGORIZER_FACTORY) that a
# linker would otherwise drop from a static library - confirm.
# NOTE(review): the LINK_LIBRARY generator expression needs CMake >= 3.24 -
# confirm against the project's cmake_minimum_required().
target_link_libraries(mkdwarfs_main "$<LINK_LIBRARY:WHOLE_ARCHIVE,dwarfs_categorizer>")
|
||||||
|
|
||||||
if(STATIC_BUILD_DO_NOT_USE)
|
if(STATIC_BUILD_DO_NOT_USE)
|
||||||
# ...................................................................
|
# ...................................................................
|
||||||
@ -883,6 +906,7 @@ if(STATIC_BUILD_DO_NOT_USE)
|
|||||||
import_static_lib(static_libssl "libssl.a")
|
import_static_lib(static_libssl "libssl.a")
|
||||||
import_static_lib(static_libunwind "libunwind.a")
|
import_static_lib(static_libunwind "libunwind.a")
|
||||||
import_static_lib(static_libarchive "libarchive.a")
|
import_static_lib(static_libarchive "libarchive.a")
|
||||||
|
import_static_lib(static_libmagic "libmagic.a")
|
||||||
|
|
||||||
set_target_properties(static_libunwind PROPERTIES INTERFACE_LINK_LIBRARIES
|
set_target_properties(static_libunwind PROPERTIES INTERFACE_LINK_LIBRARIES
|
||||||
PkgConfig::LIBLZMA)
|
PkgConfig::LIBLZMA)
|
||||||
@ -890,7 +914,10 @@ if(STATIC_BUILD_DO_NOT_USE)
|
|||||||
static_libgflags)
|
static_libgflags)
|
||||||
set_target_properties(static_librt PROPERTIES INTERFACE_LINK_LIBRARIES
|
set_target_properties(static_librt PROPERTIES INTERFACE_LINK_LIBRARIES
|
||||||
static_libgflags)
|
static_libgflags)
|
||||||
|
# Anything that links static_libmagic also gets static_libz on its link line.
set_property(
  TARGET static_libmagic
  PROPERTY INTERFACE_LINK_LIBRARIES static_libz)
|
||||||
|
|
||||||
|
target_link_libraries(dwarfs_categorizer static_libmagic)
|
||||||
|
|
||||||
foreach(tgt ${BINARY_TARGETS})
|
foreach(tgt ${BINARY_TARGETS})
|
||||||
if(PREFER_SYSTEM_LIBFMT)
|
if(PREFER_SYSTEM_LIBFMT)
|
||||||
|
197
include/dwarfs/categorizer.h
Normal file
197
include/dwarfs/categorizer.h
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
|
||||||
|
|
||||||
|
#include "dwarfs/file_category.h"
|
||||||
|
|
||||||
|
namespace boost::program_options {
|
||||||
|
class options_description;
|
||||||
|
class variables_map;
|
||||||
|
} // namespace boost::program_options
|
||||||
|
|
||||||
|
namespace dwarfs {
|
||||||
|
|
||||||
|
class logger;
|
||||||
|
|
||||||
|
class categorizer {
|
||||||
|
public:
|
||||||
|
virtual ~categorizer() = default;
|
||||||
|
|
||||||
|
virtual std::span<std::string_view const> categories() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class random_access_categorizer : public categorizer {
|
||||||
|
public:
|
||||||
|
virtual std::optional<std::string_view>
|
||||||
|
categorize(std::filesystem::path const& path,
|
||||||
|
std::span<uint8_t const> data) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class sequential_categorizer_job {
|
||||||
|
public:
|
||||||
|
virtual ~sequential_categorizer_job() = default;
|
||||||
|
|
||||||
|
virtual void add(std::span<uint8_t const> data) = 0;
|
||||||
|
virtual std::optional<std::string_view> result() = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class sequential_categorizer : public categorizer {
|
||||||
|
public:
|
||||||
|
virtual std::unique_ptr<sequential_categorizer_job>
|
||||||
|
job(std::filesystem::path const& path, size_t total_size) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class categorizer_job {
|
||||||
|
public:
|
||||||
|
class impl;
|
||||||
|
|
||||||
|
categorizer_job();
|
||||||
|
categorizer_job(std::unique_ptr<impl> impl);
|
||||||
|
|
||||||
|
void categorize_random_access(std::span<uint8_t const> data) {
|
||||||
|
return impl_->categorize_random_access(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
void categorize_sequential(std::span<uint8_t const> data) {
|
||||||
|
return impl_->categorize_sequential(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
file_category result() { return impl_->result(); }
|
||||||
|
|
||||||
|
explicit operator bool() const { return impl_ != nullptr; }
|
||||||
|
|
||||||
|
class impl {
|
||||||
|
public:
|
||||||
|
virtual ~impl() = default;
|
||||||
|
|
||||||
|
virtual void categorize_random_access(std::span<uint8_t const> data) = 0;
|
||||||
|
virtual void categorize_sequential(std::span<uint8_t const> data) = 0;
|
||||||
|
virtual file_category result() = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<impl> impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class categorizer_manager {
|
||||||
|
public:
|
||||||
|
categorizer_manager(logger& lgr);
|
||||||
|
|
||||||
|
void add(std::shared_ptr<categorizer const> c) { impl_->add(std::move(c)); }
|
||||||
|
|
||||||
|
categorizer_job job(std::filesystem::path const& path) const {
|
||||||
|
return impl_->job(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string_view category_name(file_category c) const {
|
||||||
|
return impl_->category_name(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
class impl {
|
||||||
|
public:
|
||||||
|
virtual ~impl() = default;
|
||||||
|
|
||||||
|
virtual void add(std::shared_ptr<categorizer const> c) = 0;
|
||||||
|
virtual categorizer_job job(std::filesystem::path const& path) const = 0;
|
||||||
|
virtual std::string_view category_name(file_category c) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<impl> impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class categorizer_info {
|
||||||
|
public:
|
||||||
|
virtual ~categorizer_info() = default;
|
||||||
|
|
||||||
|
virtual std::string_view name() const = 0;
|
||||||
|
virtual std::shared_ptr<boost::program_options::options_description const>
|
||||||
|
options() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class categorizer_factory : public categorizer_info {
|
||||||
|
public:
|
||||||
|
virtual std::unique_ptr<categorizer>
|
||||||
|
create(logger& lgr,
|
||||||
|
boost::program_options::variables_map const& vm) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace detail {

/**
 * Helper whose constructor registers a factory of type `T` with the
 * categorizer registry. The constructor is defined further down in this
 * header, after the registry class itself.
 */
template <class T>
class categorizer_factory_registrar {
 public:
  categorizer_factory_registrar();
};

} // namespace detail
|
||||||
|
|
||||||
|
class categorizer_registry {
|
||||||
|
public:
|
||||||
|
template <typename T>
|
||||||
|
friend class detail::categorizer_factory_registrar;
|
||||||
|
|
||||||
|
static categorizer_registry& instance();
|
||||||
|
|
||||||
|
std::unique_ptr<categorizer>
|
||||||
|
create(logger& lgr, std::string const& name,
|
||||||
|
boost::program_options::variables_map const& vm) const;
|
||||||
|
|
||||||
|
void add_options(boost::program_options::options_description& opts) const;
|
||||||
|
|
||||||
|
std::vector<std::string> categorizer_names() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
categorizer_registry();
|
||||||
|
~categorizer_registry();
|
||||||
|
|
||||||
|
void register_factory(std::unique_ptr<categorizer_factory const>&& factory);
|
||||||
|
|
||||||
|
std::map<std::string, std::unique_ptr<categorizer_factory const>> factories_;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
categorizer_factory_registrar<T>::categorizer_factory_registrar() {
|
||||||
|
::dwarfs::categorizer_registry::instance().register_factory(
|
||||||
|
std::make_unique<T>());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace detail
|
||||||
|
|
||||||
|
/**
 * Define a registrar object with internal linkage for `factory`. Constructing
 * that object registers the factory with the categorizer registry.
 *
 * `factory` is pasted into the object's name, so it has to be a plain
 * identifier.
 */
#define REGISTER_CATEGORIZER_FACTORY(factory)                                  \
  namespace {                                                                  \
  ::dwarfs::detail::categorizer_factory_registrar<factory>                     \
      the_##factory##_registrar;                                               \
  }
|
||||||
|
|
||||||
|
} // namespace dwarfs
|
72
include/dwarfs/file_category.h
Normal file
72
include/dwarfs/file_category.h
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdint>
#include <limits>
#include <stdexcept> // std::range_error, thrown by file_category::value()

namespace dwarfs {

/**
 * Small value type identifying a file category by number.
 *
 * The largest representable number is reserved as the "uninitialized"
 * marker; a default-constructed object holds that marker and is empty().
 * Construction and assignment from a raw number are implicit.
 */
class file_category {
 public:
  using value_type = uint32_t;

  /// Marker stored while no category has been assigned.
  static constexpr value_type const uninitialized{
      std::numeric_limits<value_type>::max()};
  /// Smallest valid category number.
  static constexpr value_type const min{0};
  /// Largest valid category number (one below the marker).
  static constexpr value_type const max{std::numeric_limits<value_type>::max() -
                                        1};

  file_category()
      : value_{uninitialized} {}
  file_category(value_type v)
      : value_{v} {}

  file_category(file_category const&) = default;
  file_category(file_category&&) = default;

  file_category& operator=(file_category const&) = default;
  file_category& operator=(file_category&&) = default;

  file_category& operator=(value_type v) {
    value_ = v;
    return *this;
  }

  /**
   * Category number.
   *
   * \throws std::range_error if the object is empty()
   */
  value_type value() const {
    if (empty()) {
      throw std::range_error("file_category is uninitialized");
    }
    return value_;
  }

  /// Reset to the uninitialized state.
  void clear() { value_ = uninitialized; }

  /// True while no category number is stored.
  bool empty() const { return value_ == uninitialized; }

  /// True if a category number is stored.
  explicit operator bool() const { return !empty(); }

 private:
  value_type value_;
};

} // namespace dwarfs
|
@ -27,6 +27,7 @@
|
|||||||
|
|
||||||
#include <folly/small_vector.h>
|
#include <folly/small_vector.h>
|
||||||
|
|
||||||
|
#include "dwarfs/file_category.h"
|
||||||
#include "dwarfs/nilsimsa.h"
|
#include "dwarfs/nilsimsa.h"
|
||||||
#include "dwarfs/object.h"
|
#include "dwarfs/object.h"
|
||||||
|
|
||||||
@ -58,6 +59,7 @@ class inode : public object {
|
|||||||
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
|
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
|
||||||
virtual void
|
virtual void
|
||||||
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
|
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
|
||||||
|
virtual file_category category() const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -24,6 +24,11 @@
|
|||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <string_view>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "dwarfs/file_category.h"
|
||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
@ -54,6 +59,10 @@ class inode_manager {
|
|||||||
impl_->for_each_inode_in_order(fn);
|
impl_->for_each_inode_in_order(fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::pair<file_category, size_t>> category_counts() const {
|
||||||
|
return impl_->category_counts();
|
||||||
|
}
|
||||||
|
|
||||||
class impl {
|
class impl {
|
||||||
public:
|
public:
|
||||||
virtual ~impl() = default;
|
virtual ~impl() = default;
|
||||||
@ -65,6 +74,8 @@ class inode_manager {
|
|||||||
file_order_options const& file_order, order_cb const& fn) = 0;
|
file_order_options const& file_order, order_cb const& fn) = 0;
|
||||||
virtual void for_each_inode_in_order(
|
virtual void for_each_inode_in_order(
|
||||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||||
|
virtual std::vector<std::pair<file_category, size_t>>
|
||||||
|
category_counts() const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
|
#include <memory>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
|
|
||||||
#include "dwarfs/file_stat.h"
|
#include "dwarfs/file_stat.h"
|
||||||
@ -32,6 +33,7 @@
|
|||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
|
class categorizer_manager;
|
||||||
class entry;
|
class entry;
|
||||||
|
|
||||||
enum class mlock_mode { NONE, TRY, MUST };
|
enum class mlock_mode { NONE, TRY, MUST };
|
||||||
@ -78,11 +80,12 @@ struct inode_options {
|
|||||||
bool with_similarity{false};
|
bool with_similarity{false};
|
||||||
bool with_nilsimsa{false};
|
bool with_nilsimsa{false};
|
||||||
std::optional<size_t> max_similarity_scan_size;
|
std::optional<size_t> max_similarity_scan_size;
|
||||||
|
std::shared_ptr<categorizer_manager> categorizer_mgr;
|
||||||
|
|
||||||
bool needs_scan(size_t size) const {
|
bool needs_scan(size_t size) const {
|
||||||
return (with_similarity || with_nilsimsa) &&
|
return categorizer_mgr || ((with_similarity || with_nilsimsa) &&
|
||||||
(!max_similarity_scan_size ||
|
(!max_similarity_scan_size ||
|
||||||
size <= max_similarity_scan_size.value());
|
size <= max_similarity_scan_size.value()));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
268
src/dwarfs/categorizer.cpp
Normal file
268
src/dwarfs/categorizer.cpp
Normal file
@ -0,0 +1,268 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include <folly/container/Enumerate.h>
|
||||||
|
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
|
#include "dwarfs/compiler.h"
|
||||||
|
#include "dwarfs/error.h"
|
||||||
|
#include "dwarfs/logger.h"
|
||||||
|
|
||||||
|
namespace dwarfs {
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
namespace {

// Category name a job reports when no categorizer assigned one.
constexpr std::string_view DEFAULT_CATEGORY{"<default>"};

} // namespace
|
||||||
|
|
||||||
|
class categorizer_manager_private {
|
||||||
|
public:
|
||||||
|
virtual ~categorizer_manager_private() = default;
|
||||||
|
|
||||||
|
virtual std::vector<std::shared_ptr<categorizer const>> const&
|
||||||
|
categorizers() const = 0;
|
||||||
|
virtual file_category category(std::string_view cat) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
class categorizer_job_ final : public categorizer_job::impl {
|
||||||
|
public:
|
||||||
|
categorizer_job_(logger& lgr, categorizer_manager_private const& mgr,
|
||||||
|
std::filesystem::path const& path)
|
||||||
|
: LOG_PROXY_INIT(lgr)
|
||||||
|
, mgr_{mgr}
|
||||||
|
, path_{path} {}
|
||||||
|
|
||||||
|
void categorize_random_access(std::span<uint8_t const> data) override;
|
||||||
|
void categorize_sequential(std::span<uint8_t const> data) override;
|
||||||
|
file_category result() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
LOG_PROXY_DECL(LoggerPolicy);
|
||||||
|
categorizer_manager_private const& mgr_;
|
||||||
|
|
||||||
|
std::string_view best_{DEFAULT_CATEGORY};
|
||||||
|
int index_{-1};
|
||||||
|
bool is_global_best_{false};
|
||||||
|
size_t total_size_hint_{0};
|
||||||
|
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
|
||||||
|
seq_jobs_;
|
||||||
|
std::filesystem::path const path_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
void categorizer_job_<LoggerPolicy>::categorize_random_access(
|
||||||
|
std::span<uint8_t const> data) {
|
||||||
|
DWARFS_CHECK(index_ < 0,
|
||||||
|
"internal error: index already set in categorize_random_access");
|
||||||
|
|
||||||
|
total_size_hint_ = data.size();
|
||||||
|
|
||||||
|
bool global_best = true;
|
||||||
|
|
||||||
|
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
|
||||||
|
if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) {
|
||||||
|
if (auto c = p->categorize(path_, data)) {
|
||||||
|
best_ = *c;
|
||||||
|
index_ = index;
|
||||||
|
is_global_best_ = global_best;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
global_best = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
void categorizer_job_<LoggerPolicy>::categorize_sequential(
|
||||||
|
std::span<uint8_t const> data) {
|
||||||
|
if (is_global_best_) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seq_jobs_.empty()) [[unlikely]] {
|
||||||
|
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
|
||||||
|
if (index_ >= 0 && static_cast<int>(index) >= index_) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) {
|
||||||
|
if (auto job = p->job(path_, total_size_hint_)) {
|
||||||
|
seq_jobs_.emplace_back(index, std::move(job));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto&& [index, job] : seq_jobs_) {
|
||||||
|
job->add(data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
file_category categorizer_job_<LoggerPolicy>::result() {
|
||||||
|
if (!seq_jobs_.empty()) {
|
||||||
|
for (auto&& [index, job] : seq_jobs_) {
|
||||||
|
if (auto c = job->result()) {
|
||||||
|
assert(index_ < 0 || index < index_);
|
||||||
|
best_ = *c;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
seq_jobs_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_TRACE << path_ << " -> " << best_;
|
||||||
|
|
||||||
|
return mgr_.category(best_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// A default-constructed job has no implementation attached.
categorizer_job::categorizer_job() = default;

// Take ownership of the implementation all calls are forwarded to.
categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
    : impl_(std::move(impl)) {}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
class categorizer_manager_ final : public categorizer_manager::impl,
|
||||||
|
public categorizer_manager_private {
|
||||||
|
public:
|
||||||
|
categorizer_manager_(logger& lgr)
|
||||||
|
: lgr_{lgr}
|
||||||
|
, LOG_PROXY_INIT(lgr) {
|
||||||
|
add_category(DEFAULT_CATEGORY);
|
||||||
|
}
|
||||||
|
|
||||||
|
void add(std::shared_ptr<categorizer const> c) override;
|
||||||
|
categorizer_job job(std::filesystem::path const& path) const override;
|
||||||
|
std::string_view category_name(file_category c) const override;
|
||||||
|
|
||||||
|
std::vector<std::shared_ptr<categorizer const>> const&
|
||||||
|
categorizers() const override {
|
||||||
|
return categorizers_;
|
||||||
|
}
|
||||||
|
|
||||||
|
file_category category(std::string_view cat) const override {
|
||||||
|
auto it = catmap_.find(cat);
|
||||||
|
DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat));
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void add_category(std::string_view cat) {
|
||||||
|
if (catmap_.emplace(cat, categories_.size()).second) {
|
||||||
|
categories_.emplace_back(cat);
|
||||||
|
} else {
|
||||||
|
LOG_WARN << "duplicate category: " << cat;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger& lgr_;
|
||||||
|
LOG_PROXY_DECL(LoggerPolicy);
|
||||||
|
std::vector<std::shared_ptr<categorizer const>> categorizers_;
|
||||||
|
std::vector<std::string_view> categories_;
|
||||||
|
std::unordered_map<std::string_view, file_category> catmap_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
void categorizer_manager_<LoggerPolicy>::add(
|
||||||
|
std::shared_ptr<categorizer const> c) {
|
||||||
|
for (auto const& c : c->categories()) {
|
||||||
|
add_category(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
categorizers_.emplace_back(std::move(c));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
categorizer_job categorizer_manager_<LoggerPolicy>::job(
|
||||||
|
std::filesystem::path const& path) const {
|
||||||
|
return categorizer_job(
|
||||||
|
make_unique_logging_object<categorizer_job::impl, categorizer_job_,
|
||||||
|
logger_policies>(lgr_, *this, path));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
std::string_view
|
||||||
|
categorizer_manager_<LoggerPolicy>::category_name(file_category c) const {
|
||||||
|
return DWARFS_NOTHROW(categories_.at(c.value()));
|
||||||
|
}
|
||||||
|
|
||||||
|
categorizer_manager::categorizer_manager(logger& lgr)
|
||||||
|
: impl_(make_unique_logging_object<impl, categorizer_manager_,
|
||||||
|
logger_policies>(lgr)) {}
|
||||||
|
|
||||||
|
categorizer_registry& categorizer_registry::instance() {
|
||||||
|
static categorizer_registry the_instance;
|
||||||
|
return the_instance;
|
||||||
|
}
|
||||||
|
|
||||||
|
void categorizer_registry::register_factory(
|
||||||
|
std::unique_ptr<categorizer_factory const>&& factory) {
|
||||||
|
auto name = factory->name();
|
||||||
|
|
||||||
|
if (!factories_.emplace(name, std::move(factory)).second) {
|
||||||
|
std::cerr << "categorizer factory name conflict (" << name << "\n";
|
||||||
|
::abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<categorizer>
|
||||||
|
categorizer_registry::create(logger& lgr, std::string const& name,
|
||||||
|
po::variables_map const& vm) const {
|
||||||
|
auto it = factories_.find(name);
|
||||||
|
|
||||||
|
if (it == factories_.end()) {
|
||||||
|
DWARFS_THROW(runtime_error, "unknown categorizer: " + name);
|
||||||
|
}
|
||||||
|
|
||||||
|
return it->second->create(lgr, vm);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Append the options of every registered factory to `opts`. Factories that
 * return no options description are skipped.
 */
void categorizer_registry::add_options(po::options_description& opts) const {
  for (auto const& entry : factories_) {
    auto const factory_opts = entry.second->options();

    if (!factory_opts) {
      continue;
    }

    opts.add(*factory_opts);
  }
}
|
||||||
|
|
||||||
|
std::vector<std::string> categorizer_registry::categorizer_names() const {
|
||||||
|
std::vector<std::string> rv;
|
||||||
|
for (auto& f : factories_) {
|
||||||
|
rv.emplace_back(f.first);
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Both are private in the class and defaulted here in the source file.
categorizer_registry::categorizer_registry() = default;

categorizer_registry::~categorizer_registry() = default;
|
||||||
|
|
||||||
|
} // namespace dwarfs
|
129
src/dwarfs/categorizer/binary_categorizer.cpp
Normal file
129
src/dwarfs/categorizer/binary_categorizer.cpp
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
#include <cstring>
|
||||||
|
#include <map>
|
||||||
|
#include <stack>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
// TODO: this should obviously work everywhere, not only on non-Windows platforms
|
||||||
|
#ifndef _WIN32
|
||||||
|
#include <elf.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
|
#include "dwarfs/error.h"
|
||||||
|
#include "dwarfs/logger.h"
|
||||||
|
|
||||||
|
namespace dwarfs {
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
constexpr std::string_view const SOME_CATEGORY{"bla"};
|
||||||
|
|
||||||
|
class binary_categorizer_base : public random_access_categorizer {
|
||||||
|
public:
|
||||||
|
std::span<std::string_view const> categories() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
class binary_categorizer_ final : public binary_categorizer_base {
|
||||||
|
public:
|
||||||
|
binary_categorizer_(logger& lgr)
|
||||||
|
: LOG_PROXY_INIT(lgr) {}
|
||||||
|
|
||||||
|
std::optional<std::string_view>
|
||||||
|
categorize(std::filesystem::path const& path,
|
||||||
|
std::span<uint8_t const> data) const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
LOG_PROXY_DECL(LoggerPolicy);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Return the (static) list of categories this categorizer can produce.
 */
std::span<std::string_view const> binary_categorizer_base::categories() const {
  // Function-local static so the returned span stays valid indefinitely.
  static constexpr std::array<std::string_view, 1> category_list{
      {SOME_CATEGORY}};

  return category_list;
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
std::optional<std::string_view>
|
||||||
|
binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
|
||||||
|
std::span<uint8_t const> data
|
||||||
|
[[maybe_unused]]) const {
|
||||||
|
#ifndef _WIN32
|
||||||
|
auto p = data.data();
|
||||||
|
if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) {
|
||||||
|
switch (p[EI_OSABI]) {
|
||||||
|
case ELFOSABI_SYSV: // 0 /* UNIX System V ABI */
|
||||||
|
case ELFOSABI_HPUX: // 1 /* HP-UX */
|
||||||
|
case ELFOSABI_NETBSD: // 2 /* NetBSD. */
|
||||||
|
case ELFOSABI_GNU: // 3 /* Object uses GNU ELF extensions. */
|
||||||
|
case ELFOSABI_SOLARIS: // 6 /* Sun Solaris. */
|
||||||
|
case ELFOSABI_AIX: // 7 /* IBM AIX. */
|
||||||
|
case ELFOSABI_IRIX: // 8 /* SGI Irix. */
|
||||||
|
case ELFOSABI_FREEBSD: // 9 /* FreeBSD. */
|
||||||
|
case ELFOSABI_TRU64: // 10 /* Compaq TRU64 UNIX. */
|
||||||
|
case ELFOSABI_MODESTO: // 11 /* Novell Modesto. */
|
||||||
|
case ELFOSABI_OPENBSD: // 12 /* OpenBSD. */
|
||||||
|
case ELFOSABI_ARM_AEABI: // 64 /* ARM EABI */
|
||||||
|
case ELFOSABI_ARM: // 97 /* ARM */
|
||||||
|
case ELFOSABI_STANDALONE: // 255 /* Standalone (embedded) application */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
|
||||||
|
class binary_categorizer_factory : public categorizer_factory {
|
||||||
|
public:
|
||||||
|
std::string_view name() const override { return "binary"; }
|
||||||
|
|
||||||
|
std::shared_ptr<boost::program_options::options_description const>
|
||||||
|
options() const override {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<categorizer>
|
||||||
|
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||||
|
return make_unique_logging_object<categorizer, binary_categorizer_,
|
||||||
|
logger_policies>(lgr);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
REGISTER_CATEGORIZER_FACTORY(binary_categorizer_factory)
|
||||||
|
|
||||||
|
} // namespace dwarfs
|
232
src/dwarfs/categorizer/incompressible_categorizer.cpp
Normal file
232
src/dwarfs/categorizer/incompressible_categorizer.cpp
Normal file
@ -0,0 +1,232 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include <lz4.h>
|
||||||
|
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
|
#include "dwarfs/error.h"
|
||||||
|
#include "dwarfs/logger.h"
|
||||||
|
|
||||||
|
namespace dwarfs {
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Name of the category assigned to files that do not compress well.
constexpr std::string_view INCOMPRESSIBLE_CATEGORY{"incompressible"};

/**
 * Tunables of the incompressible categorizer.
 */
struct incompressible_categorizer_config {
  // Files smaller than this many bytes are not checked at all.
  size_t min_input_size;
  // Output/input size ratio at or above which data counts as incompressible.
  double max_ratio_size;
  // Fraction of incompressible blocks at or above which the whole file
  // counts as incompressible.
  double max_ratio_blocks;
  // Acceleration parameter passed to LZ4.
  int lz4_acceleration;
};
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
class incompressible_categorizer_job_ : public sequential_categorizer_job {
|
||||||
|
public:
|
||||||
|
static constexpr size_t const block_size{1024 * 1024};
|
||||||
|
|
||||||
|
incompressible_categorizer_job_(logger& lgr,
|
||||||
|
incompressible_categorizer_config const& cfg,
|
||||||
|
std::filesystem::path const& path,
|
||||||
|
size_t total_size)
|
||||||
|
: LOG_PROXY_INIT(lgr)
|
||||||
|
, cfg_{cfg}
|
||||||
|
, path_{path} {
|
||||||
|
input_.reserve(total_size < block_size ? total_size : block_size);
|
||||||
|
state_ = ::malloc(LZ4_sizeofState());
|
||||||
|
}
|
||||||
|
|
||||||
|
~incompressible_categorizer_job_() { ::free(state_); }
|
||||||
|
|
||||||
|
void add(std::span<uint8_t const> data) override {
|
||||||
|
while (!data.empty()) {
|
||||||
|
auto part_size = input_.size() + data.size() <= block_size
|
||||||
|
? data.size()
|
||||||
|
: block_size - input_.size();
|
||||||
|
add_input(data.first(part_size));
|
||||||
|
data = data.subspan(part_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::optional<std::string_view> result() override {
|
||||||
|
if (!input_.empty()) {
|
||||||
|
compress();
|
||||||
|
}
|
||||||
|
LOG_TRACE << path_ << " -> blocks: " << incompressible_blocks_ << "/"
|
||||||
|
<< total_blocks_ << ", total compression ratio: "
|
||||||
|
<< fmt::format("{:.2f}%",
|
||||||
|
100.0 * total_output_size_ / total_input_size_);
|
||||||
|
if (total_blocks_ > 0 &&
|
||||||
|
(total_output_size_ >= cfg_.max_ratio_size * total_input_size_ ||
|
||||||
|
incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) {
|
||||||
|
return INCOMPRESSIBLE_CATEGORY;
|
||||||
|
}
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void add_input(std::span<uint8_t const> data) {
|
||||||
|
auto current_size = input_.size();
|
||||||
|
assert(current_size + data.size() <= block_size);
|
||||||
|
input_.resize(current_size + data.size());
|
||||||
|
::memcpy(&input_[current_size], data.data(), data.size());
|
||||||
|
if (input_.size() == block_size) {
|
||||||
|
compress();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void compress() {
|
||||||
|
total_input_size_ += input_.size();
|
||||||
|
|
||||||
|
output_.resize(::LZ4_compressBound(input_.size()));
|
||||||
|
|
||||||
|
auto rv = ::LZ4_compress_fast_extState(
|
||||||
|
state_, reinterpret_cast<char*>(input_.data()),
|
||||||
|
reinterpret_cast<char*>(output_.data()), input_.size(), output_.size(),
|
||||||
|
cfg_.lz4_acceleration);
|
||||||
|
|
||||||
|
if (rv == 0) {
|
||||||
|
DWARFS_THROW(runtime_error,
|
||||||
|
"unexpected error in LZ4_compress_fast_extState");
|
||||||
|
}
|
||||||
|
|
||||||
|
total_output_size_ += rv;
|
||||||
|
++total_blocks_;
|
||||||
|
|
||||||
|
if (rv >= static_cast<int>(cfg_.max_ratio_size * input_.size())) {
|
||||||
|
++incompressible_blocks_;
|
||||||
|
}
|
||||||
|
|
||||||
|
input_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_PROXY_DECL(LoggerPolicy);
|
||||||
|
void* state_;
|
||||||
|
std::vector<uint8_t> input_;
|
||||||
|
std::vector<uint8_t> output_;
|
||||||
|
size_t total_input_size_{0};
|
||||||
|
size_t total_output_size_{0};
|
||||||
|
size_t total_blocks_{0};
|
||||||
|
size_t incompressible_blocks_{0};
|
||||||
|
incompressible_categorizer_config const& cfg_;
|
||||||
|
std::filesystem::path const& path_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class incompressible_categorizer_ final : public sequential_categorizer {
|
||||||
|
public:
|
||||||
|
incompressible_categorizer_(logger& lgr,
|
||||||
|
incompressible_categorizer_config const& cfg);
|
||||||
|
|
||||||
|
std::span<std::string_view const> categories() const override;
|
||||||
|
std::unique_ptr<sequential_categorizer_job>
|
||||||
|
job(std::filesystem::path const& path, size_t total_size) const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
logger& lgr_;
|
||||||
|
incompressible_categorizer_config const config_;
|
||||||
|
};
|
||||||
|
|
||||||
|
incompressible_categorizer_::incompressible_categorizer_(
|
||||||
|
logger& lgr, incompressible_categorizer_config const& cfg)
|
||||||
|
: lgr_{lgr}
|
||||||
|
, config_{cfg} {}
|
||||||
|
|
||||||
|
/**
 * Return the (static) list of categories this categorizer can produce.
 */
std::span<std::string_view const>
incompressible_categorizer_::categories() const {
  // Function-local static so the returned span stays valid indefinitely.
  static constexpr std::array<std::string_view, 1> category_list{
      {INCOMPRESSIBLE_CATEGORY}};

  return category_list;
}
|
||||||
|
|
||||||
|
std::unique_ptr<sequential_categorizer_job>
|
||||||
|
incompressible_categorizer_::job(std::filesystem::path const& path,
|
||||||
|
size_t total_size) const {
|
||||||
|
if (total_size < config_.min_input_size) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
return make_unique_logging_object<sequential_categorizer_job,
|
||||||
|
incompressible_categorizer_job_,
|
||||||
|
logger_policies>(lgr_, config_, path,
|
||||||
|
total_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
class incompressible_categorizer_factory : public categorizer_factory {
|
||||||
|
public:
|
||||||
|
incompressible_categorizer_factory()
|
||||||
|
: opts_{std::make_shared<po::options_description>(
|
||||||
|
"Incompressible categorizer options")} {
|
||||||
|
static constexpr double const default_ratio{0.99};
|
||||||
|
auto const default_ratio_str{fmt::format("{:.2f}", default_ratio)};
|
||||||
|
// clang-format off
|
||||||
|
opts_->add_options()
|
||||||
|
("incompressible-min-input-size",
|
||||||
|
po::value<size_t>(&cfg_.min_input_size)->default_value(256),
|
||||||
|
"minimum file size in bytes to check for incompressibility")
|
||||||
|
("incompressible-max-size-ratio",
|
||||||
|
po::value<double>(&cfg_.max_ratio_size)
|
||||||
|
->default_value(default_ratio, default_ratio_str),
|
||||||
|
"LZ4 compression ratio above files are considered incompressible")
|
||||||
|
("incompressible-max-blocks-ratio",
|
||||||
|
po::value<double>(&cfg_.max_ratio_blocks)
|
||||||
|
->default_value(default_ratio, default_ratio_str),
|
||||||
|
"ratio of incompressible LZ4 blocks above which the whole file"
|
||||||
|
" is considered incompressible")
|
||||||
|
("incompressible-lz4-acceleration (1..65537)",
|
||||||
|
po::value<int>(&cfg_.lz4_acceleration)->default_value(1),
|
||||||
|
"LZ4 acceleration value")
|
||||||
|
;
|
||||||
|
// clang-format on
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string_view name() const override { return "incompressible"; }
|
||||||
|
|
||||||
|
std::shared_ptr<po::options_description const> options() const override {
|
||||||
|
return opts_;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<categorizer>
|
||||||
|
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||||
|
return std::make_unique<incompressible_categorizer_>(lgr, cfg_);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
incompressible_categorizer_config cfg_;
|
||||||
|
std::shared_ptr<po::options_description> opts_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
REGISTER_CATEGORIZER_FACTORY(incompressible_categorizer_factory)
|
||||||
|
|
||||||
|
} // namespace dwarfs
|
198
src/dwarfs/categorizer/libmagic_categorizer.cpp
Normal file
198
src/dwarfs/categorizer/libmagic_categorizer.cpp
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
#include <map>
|
||||||
|
#include <stack>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include <folly/Synchronized.h>
|
||||||
|
|
||||||
|
#include <magic.h>
|
||||||
|
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
|
#include "dwarfs/error.h"
|
||||||
|
#include "dwarfs/logger.h"
|
||||||
|
|
||||||
|
namespace dwarfs {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
// Placeholder name of the only category this categorizer advertises.
constexpr std::string_view const SOME_CATEGORY{"bla"};

// MIME types that identify executables and shared libraries. The table is
// a read-only lookup set, hence `const`.
std::unordered_set<std::string_view> const executable_mime_types{
    "application/x-executable",
    "application/x-sharedlib",
};
|
||||||
|
|
||||||
|
class magic_wrapper {
|
||||||
|
public:
|
||||||
|
magic_wrapper() = default;
|
||||||
|
|
||||||
|
size_t cookie_count() const {
|
||||||
|
auto rlock = cookies_.rlock();
|
||||||
|
return rlock->size();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string identify(std::span<uint8_t const> data) const {
|
||||||
|
std::string rv;
|
||||||
|
scoped_cookie m(*this);
|
||||||
|
if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) {
|
||||||
|
rv.assign(id);
|
||||||
|
}
|
||||||
|
if (rv.starts_with("application/")) {
|
||||||
|
::magic_setflags(m.get(), MAGIC_NONE);
|
||||||
|
if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) {
|
||||||
|
rv += "; " + std::string(id);
|
||||||
|
}
|
||||||
|
::magic_setflags(m.get(), MAGIC_MIME_TYPE);
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
using magic_cookie_t =
|
||||||
|
std::unique_ptr<struct ::magic_set, decltype(&::magic_close)>;
|
||||||
|
|
||||||
|
magic_cookie_t new_cookie() const {
|
||||||
|
magic_cookie_t m(::magic_open(MAGIC_MIME_TYPE), &::magic_close);
|
||||||
|
if (!m) {
|
||||||
|
throw std::runtime_error("could not create magic cookie");
|
||||||
|
}
|
||||||
|
if (::magic_load(m.get(), NULL) != 0) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
fmt::format("(magic) {}", ::magic_error(m.get())));
|
||||||
|
}
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
class scoped_cookie {
|
||||||
|
public:
|
||||||
|
scoped_cookie(magic_wrapper const& w)
|
||||||
|
: cookie_{get_scoped_cookie(w)}
|
||||||
|
, w_{w} {}
|
||||||
|
|
||||||
|
~scoped_cookie() {
|
||||||
|
auto wlock = w_.cookies_.wlock();
|
||||||
|
wlock->push(std::move(cookie_));
|
||||||
|
}
|
||||||
|
|
||||||
|
::magic_t get() const { return cookie_.get(); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
static magic_cookie_t get_scoped_cookie(magic_wrapper const& w) {
|
||||||
|
auto wlock = w.cookies_.wlock();
|
||||||
|
if (wlock->empty()) [[unlikely]] {
|
||||||
|
return w.new_cookie();
|
||||||
|
}
|
||||||
|
auto cookie = std::move(wlock->top());
|
||||||
|
wlock->pop();
|
||||||
|
return cookie;
|
||||||
|
}
|
||||||
|
|
||||||
|
magic_cookie_t cookie_;
|
||||||
|
magic_wrapper const& w_;
|
||||||
|
};
|
||||||
|
|
||||||
|
mutable folly::Synchronized<std::stack<magic_cookie_t>> cookies_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class libmagic_categorizer_base : public random_access_categorizer {
|
||||||
|
public:
|
||||||
|
std::span<std::string_view const> categories() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
class libmagic_categorizer_ final : public libmagic_categorizer_base {
|
||||||
|
public:
|
||||||
|
explicit libmagic_categorizer_(logger& lgr)
|
||||||
|
: LOG_PROXY_INIT(lgr) {}
|
||||||
|
|
||||||
|
~libmagic_categorizer_() {
|
||||||
|
LOG_INFO << m_.cookie_count() << " magic cookies were used";
|
||||||
|
{
|
||||||
|
auto rlock = mimetypes_.rlock();
|
||||||
|
for (auto const& [k, v] : *rlock) {
|
||||||
|
LOG_INFO << k << " -> " << v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::optional<std::string_view>
|
||||||
|
categorize(std::filesystem::path const& path,
|
||||||
|
std::span<uint8_t const> data) const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
LOG_PROXY_DECL(LoggerPolicy);
|
||||||
|
magic_wrapper m_;
|
||||||
|
mutable folly::Synchronized<std::map<std::string, size_t>> mimetypes_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Return the (static) list of categories this categorizer can produce.
 */
std::span<std::string_view const>
libmagic_categorizer_base::categories() const {
  // Function-local static so the returned span stays valid indefinitely.
  static constexpr std::array<std::string_view, 1> category_list{
      {SOME_CATEGORY}};

  return category_list;
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
std::optional<std::string_view> libmagic_categorizer_<LoggerPolicy>::categorize(
|
||||||
|
std::filesystem::path const& path, std::span<uint8_t const> data) const {
|
||||||
|
auto id = m_.identify(data);
|
||||||
|
LOG_DEBUG << path << " -> (magic) " << id;
|
||||||
|
{
|
||||||
|
auto wlock = mimetypes_.wlock();
|
||||||
|
++(*wlock)[id];
|
||||||
|
}
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
|
||||||
|
class libmagic_categorizer_factory : public categorizer_factory {
|
||||||
|
public:
|
||||||
|
std::string_view name() const override { return "libmagic"; }
|
||||||
|
|
||||||
|
std::shared_ptr<boost::program_options::options_description const>
|
||||||
|
options() const override {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<categorizer>
|
||||||
|
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||||
|
return make_unique_logging_object<categorizer, libmagic_categorizer_,
|
||||||
|
logger_policies>(lgr);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
REGISTER_CATEGORIZER_FACTORY(libmagic_categorizer_factory)
|
||||||
|
|
||||||
|
} // namespace dwarfs
|
@ -29,10 +29,12 @@
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
|
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
#include "dwarfs/compiler.h"
|
#include "dwarfs/compiler.h"
|
||||||
#include "dwarfs/entry.h"
|
#include "dwarfs/entry.h"
|
||||||
#include "dwarfs/error.h"
|
#include "dwarfs/error.h"
|
||||||
@ -144,8 +146,19 @@ class inode_ : public inode {
|
|||||||
similarity sc;
|
similarity sc;
|
||||||
nilsimsa nc;
|
nilsimsa nc;
|
||||||
|
|
||||||
|
categorizer_job catjob;
|
||||||
|
|
||||||
|
if (opts.categorizer_mgr) {
|
||||||
|
catjob =
|
||||||
|
opts.categorizer_mgr->job(mm ? mm->path().string() : "<no-file>");
|
||||||
|
}
|
||||||
|
|
||||||
if (mm) {
|
if (mm) {
|
||||||
auto update_hashes = [&](uint8_t const* data, size_t size) {
|
if (catjob) {
|
||||||
|
catjob.categorize_random_access(mm->span());
|
||||||
|
}
|
||||||
|
|
||||||
|
auto scan_sequential = [&](uint8_t const* data, size_t size) {
|
||||||
if (opts.with_similarity) {
|
if (opts.with_similarity) {
|
||||||
sc.update(data, size);
|
sc.update(data, size);
|
||||||
}
|
}
|
||||||
@ -153,6 +166,10 @@ class inode_ : public inode {
|
|||||||
if (opts.with_nilsimsa) {
|
if (opts.with_nilsimsa) {
|
||||||
nc.update(data, size);
|
nc.update(data, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (catjob) {
|
||||||
|
catjob.categorize_sequential(std::span(data, size));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
constexpr size_t chunk_size = 32 << 20;
|
constexpr size_t chunk_size = 32 << 20;
|
||||||
@ -160,13 +177,13 @@ class inode_ : public inode {
|
|||||||
size_t size = mm->size();
|
size_t size = mm->size();
|
||||||
|
|
||||||
while (size >= chunk_size) {
|
while (size >= chunk_size) {
|
||||||
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
scan_sequential(mm->as<uint8_t>(offset), chunk_size);
|
||||||
mm->release_until(offset);
|
mm->release_until(offset);
|
||||||
offset += chunk_size;
|
offset += chunk_size;
|
||||||
size -= chunk_size;
|
size -= chunk_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
update_hashes(mm->as<uint8_t>(offset), size);
|
scan_sequential(mm->as<uint8_t>(offset), size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opts.with_similarity) {
|
if (opts.with_similarity) {
|
||||||
@ -182,6 +199,10 @@ class inode_ : public inode {
|
|||||||
nilsimsa_valid_ = true;
|
nilsimsa_valid_ = true;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (catjob) {
|
||||||
|
category_ = catjob.result();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_chunk(size_t block, size_t offset, size_t size) override {
|
void add_chunk(size_t block, size_t offset, size_t size) override {
|
||||||
@ -207,9 +228,12 @@ class inode_ : public inode {
|
|||||||
vec.insert(vec.end(), chunks_.begin(), chunks_.end());
|
vec.insert(vec.end(), chunks_.begin(), chunks_.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
file_category category() const override { return category_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::optional<uint32_t> num_;
|
std::optional<uint32_t> num_;
|
||||||
uint32_t similarity_hash_{0};
|
uint32_t similarity_hash_{0};
|
||||||
|
file_category category_;
|
||||||
files_vector files_;
|
files_vector files_;
|
||||||
std::vector<chunk_type> chunks_;
|
std::vector<chunk_type> chunks_;
|
||||||
nilsimsa::hash_type nilsimsa_similarity_hash_;
|
nilsimsa::hash_type nilsimsa_similarity_hash_;
|
||||||
@ -254,6 +278,27 @@ class inode_manager_ final : public inode_manager::impl {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::pair<file_category, size_t>>
|
||||||
|
category_counts() const override {
|
||||||
|
std::unordered_map<file_category::value_type, size_t> tmp;
|
||||||
|
|
||||||
|
for (auto const& i : inodes_) {
|
||||||
|
++tmp[i->category().value()];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::pair<file_category, size_t>> rv;
|
||||||
|
|
||||||
|
for (auto const& [k, v] : tmp) {
|
||||||
|
rv.emplace_back(k, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) {
|
||||||
|
return a.first.value() < b.first.value();
|
||||||
|
});
|
||||||
|
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void order_inodes_by_path() {
|
void order_inodes_by_path() {
|
||||||
std::vector<std::string> paths;
|
std::vector<std::string> paths;
|
||||||
|
@ -37,6 +37,7 @@
|
|||||||
#include <fmt/format.h>
|
#include <fmt/format.h>
|
||||||
|
|
||||||
#include "dwarfs/block_data.h"
|
#include "dwarfs/block_data.h"
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
#include "dwarfs/entry.h"
|
#include "dwarfs/entry.h"
|
||||||
#include "dwarfs/error.h"
|
#include "dwarfs/error.h"
|
||||||
#include "dwarfs/file_scanner.h"
|
#include "dwarfs/file_scanner.h"
|
||||||
@ -606,6 +607,14 @@ void scanner_<LoggerPolicy>::scan(
|
|||||||
<< prog.duplicate_files << "/" << prog.files_found
|
<< prog.duplicate_files << "/" << prog.files_found
|
||||||
<< " duplicate files";
|
<< " duplicate files";
|
||||||
|
|
||||||
|
if (options_.inode.categorizer_mgr) {
|
||||||
|
for (auto const& cc : im.category_counts()) {
|
||||||
|
LOG_INFO << cc.second << " "
|
||||||
|
<< options_.inode.categorizer_mgr->category_name(cc.first)
|
||||||
|
<< " files";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
global_entry_data ge_data(options_);
|
global_entry_data ge_data(options_);
|
||||||
thrift::metadata::metadata mv2;
|
thrift::metadata::metadata mv2;
|
||||||
|
|
||||||
|
@ -53,6 +53,7 @@
|
|||||||
#include "dwarfs/block_compressor.h"
|
#include "dwarfs/block_compressor.h"
|
||||||
#include "dwarfs/block_manager.h"
|
#include "dwarfs/block_manager.h"
|
||||||
#include "dwarfs/builtin_script.h"
|
#include "dwarfs/builtin_script.h"
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
#include "dwarfs/chmod_transformer.h"
|
#include "dwarfs/chmod_transformer.h"
|
||||||
#include "dwarfs/console_writer.h"
|
#include "dwarfs/console_writer.h"
|
||||||
#include "dwarfs/entry.h"
|
#include "dwarfs/entry.h"
|
||||||
@ -318,7 +319,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
|||||||
std::vector<sys_string> filter;
|
std::vector<sys_string> filter;
|
||||||
size_t num_workers, num_scanner_workers;
|
size_t num_workers, num_scanner_workers;
|
||||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||||
force_overwrite = false;
|
force_overwrite = false, enable_categorizer = false;
|
||||||
unsigned level;
|
unsigned level;
|
||||||
int compress_niceness;
|
int compress_niceness;
|
||||||
uint16_t uid, gid;
|
uint16_t uid, gid;
|
||||||
@ -391,6 +392,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
|||||||
("recompress",
|
("recompress",
|
||||||
po::value<std::string>(&recompress_opts)->implicit_value("all"),
|
po::value<std::string>(&recompress_opts)->implicit_value("all"),
|
||||||
"recompress an existing filesystem (none, block, metadata, all)")
|
"recompress an existing filesystem (none, block, metadata, all)")
|
||||||
|
("categorize",
|
||||||
|
po::value<bool>(&enable_categorizer)->zero_tokens(),
|
||||||
|
"WIP enable categorizer")
|
||||||
("order",
|
("order",
|
||||||
po::value<std::string>(&order),
|
po::value<std::string>(&order),
|
||||||
order_desc.c_str())
|
order_desc.c_str())
|
||||||
@ -510,6 +514,9 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
|||||||
.add(filesystem_opts)
|
.add(filesystem_opts)
|
||||||
.add(metadata_opts);
|
.add(metadata_opts);
|
||||||
|
|
||||||
|
auto& catreg = categorizer_registry::instance();
|
||||||
|
catreg.add_options(opts);
|
||||||
|
|
||||||
po::variables_map vm;
|
po::variables_map vm;
|
||||||
|
|
||||||
auto& sys_err_out = SYS_CERR;
|
auto& sys_err_out = SYS_CERR;
|
||||||
@ -1021,6 +1028,14 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
|||||||
options.file_order.mode == file_order_mode::SIMILARITY;
|
options.file_order.mode == file_order_mode::SIMILARITY;
|
||||||
options.inode.with_nilsimsa =
|
options.inode.with_nilsimsa =
|
||||||
options.file_order.mode == file_order_mode::NILSIMSA;
|
options.file_order.mode == file_order_mode::NILSIMSA;
|
||||||
|
if (enable_categorizer) {
|
||||||
|
options.inode.categorizer_mgr =
|
||||||
|
std::make_shared<categorizer_manager>(lgr);
|
||||||
|
// TODO
|
||||||
|
for (auto const& name : catreg.categorizer_names()) {
|
||||||
|
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
||||||
std::make_shared<os_access_generic>(), std::move(script),
|
std::make_shared<os_access_generic>(), std::move(script),
|
||||||
@ -1031,6 +1046,8 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
|||||||
} else {
|
} else {
|
||||||
s.scan(fsw, path, prog);
|
s.scan(fsw, path, prog);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
options.inode.categorizer_mgr.reset();
|
||||||
}
|
}
|
||||||
} catch (runtime_error const& e) {
|
} catch (runtime_error const& e) {
|
||||||
LOG_ERROR << e.what();
|
LOG_ERROR << e.what();
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
"glog",
|
"glog",
|
||||||
"libarchive",
|
"libarchive",
|
||||||
"libevent",
|
"libevent",
|
||||||
|
"libmagic",
|
||||||
"openssl",
|
"openssl",
|
||||||
"pkgconf",
|
"pkgconf",
|
||||||
"utfcpp",
|
"utfcpp",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user