mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
Switch categorizers to returning inode fragments
This commit is contained in:
parent
1ac36bb6fa
commit
33b2dfb95f
@ -373,6 +373,7 @@ list(
|
||||
src/dwarfs/fstypes.cpp
|
||||
src/dwarfs/fs_section.cpp
|
||||
src/dwarfs/global_entry_data.cpp
|
||||
src/dwarfs/inode_fragments.cpp
|
||||
src/dwarfs/inode_manager.cpp
|
||||
src/dwarfs/inode_reader_v2.cpp
|
||||
src/dwarfs/logger.cpp
|
||||
|
@ -23,14 +23,14 @@
|
||||
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <span>
|
||||
#include <string_view>
|
||||
|
||||
#include "dwarfs/file_category.h"
|
||||
#include "dwarfs/inode_fragments.h"
|
||||
|
||||
namespace boost::program_options {
|
||||
class options_description;
|
||||
@ -41,6 +41,9 @@ namespace dwarfs {
|
||||
|
||||
class logger;
|
||||
|
||||
using category_mapper =
|
||||
std::function<fragment_category::value_type(std::string_view)>;
|
||||
|
||||
class categorizer {
|
||||
public:
|
||||
virtual ~categorizer() = default;
|
||||
@ -50,23 +53,26 @@ class categorizer {
|
||||
|
||||
class random_access_categorizer : public categorizer {
|
||||
public:
|
||||
virtual std::optional<std::string_view>
|
||||
categorize(std::filesystem::path const& path,
|
||||
std::span<uint8_t const> data) const = 0;
|
||||
virtual inode_fragments
|
||||
categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
|
||||
category_mapper const& mapper) const = 0;
|
||||
};
|
||||
|
||||
// TODO: add call to check if categorizer can return multiple fragments
|
||||
// if it *can* we must run it before we start similarity hashing
|
||||
class sequential_categorizer_job {
|
||||
public:
|
||||
virtual ~sequential_categorizer_job() = default;
|
||||
|
||||
virtual void add(std::span<uint8_t const> data) = 0;
|
||||
virtual std::optional<std::string_view> result() = 0;
|
||||
virtual inode_fragments result() = 0;
|
||||
};
|
||||
|
||||
class sequential_categorizer : public categorizer {
|
||||
public:
|
||||
virtual std::unique_ptr<sequential_categorizer_job>
|
||||
job(std::filesystem::path const& path, size_t total_size) const = 0;
|
||||
job(std::filesystem::path const& path, size_t total_size,
|
||||
category_mapper const& mapper) const = 0;
|
||||
};
|
||||
|
||||
class categorizer_job {
|
||||
@ -76,6 +82,10 @@ class categorizer_job {
|
||||
categorizer_job();
|
||||
categorizer_job(std::unique_ptr<impl> impl);
|
||||
|
||||
void set_total_size(size_t total_size) {
|
||||
return impl_->set_total_size(total_size);
|
||||
}
|
||||
|
||||
void categorize_random_access(std::span<uint8_t const> data) {
|
||||
return impl_->categorize_random_access(data);
|
||||
}
|
||||
@ -84,7 +94,7 @@ class categorizer_job {
|
||||
return impl_->categorize_sequential(data);
|
||||
}
|
||||
|
||||
file_category result() { return impl_->result(); }
|
||||
inode_fragments result() { return impl_->result(); }
|
||||
|
||||
explicit operator bool() const { return impl_ != nullptr; }
|
||||
|
||||
@ -92,9 +102,10 @@ class categorizer_job {
|
||||
public:
|
||||
virtual ~impl() = default;
|
||||
|
||||
virtual void set_total_size(size_t total_size) = 0;
|
||||
virtual void categorize_random_access(std::span<uint8_t const> data) = 0;
|
||||
virtual void categorize_sequential(std::span<uint8_t const> data) = 0;
|
||||
virtual file_category result() = 0;
|
||||
virtual inode_fragments result() = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
@ -111,7 +122,7 @@ class categorizer_manager {
|
||||
return impl_->job(path);
|
||||
}
|
||||
|
||||
std::string_view category_name(file_category c) const {
|
||||
std::string_view category_name(fragment_category::value_type c) const {
|
||||
return impl_->category_name(c);
|
||||
}
|
||||
|
||||
@ -121,7 +132,8 @@ class categorizer_manager {
|
||||
|
||||
virtual void add(std::shared_ptr<categorizer const> c) = 0;
|
||||
virtual categorizer_job job(std::filesystem::path const& path) const = 0;
|
||||
virtual std::string_view category_name(file_category c) const = 0;
|
||||
virtual std::string_view
|
||||
category_name(fragment_category::value_type c) const = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -21,12 +21,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class file_category {
|
||||
class fragment_category {
|
||||
public:
|
||||
using value_type = uint32_t;
|
||||
|
||||
@ -36,37 +37,60 @@ class file_category {
|
||||
static constexpr value_type const max{std::numeric_limits<value_type>::max() -
|
||||
1};
|
||||
|
||||
file_category()
|
||||
: value_{uninitialized} {}
|
||||
file_category(value_type v)
|
||||
fragment_category() = default;
|
||||
|
||||
explicit fragment_category(value_type v)
|
||||
: value_{v} {}
|
||||
|
||||
file_category(file_category const&) = default;
|
||||
file_category(file_category&&) = default;
|
||||
fragment_category(value_type v, value_type subcategory)
|
||||
: value_{v}
|
||||
, subcategory_{subcategory} {}
|
||||
|
||||
file_category& operator=(file_category const&) = default;
|
||||
file_category& operator=(file_category&&) = default;
|
||||
fragment_category(fragment_category const&) = default;
|
||||
fragment_category(fragment_category&&) = default;
|
||||
|
||||
file_category& operator=(value_type v) {
|
||||
fragment_category& operator=(fragment_category const&) = default;
|
||||
fragment_category& operator=(fragment_category&&) = default;
|
||||
|
||||
fragment_category& operator=(value_type v) {
|
||||
assert(v != uninitialized);
|
||||
value_ = v;
|
||||
return *this;
|
||||
}
|
||||
|
||||
value_type value() const {
|
||||
if (empty()) {
|
||||
throw std::range_error("file_category is uninitialized");
|
||||
}
|
||||
assert(!empty());
|
||||
return value_;
|
||||
}
|
||||
|
||||
void clear() { value_ = uninitialized; }
|
||||
void clear() {
|
||||
value_ = uninitialized;
|
||||
subcategory_ = uninitialized;
|
||||
}
|
||||
|
||||
bool empty() const { return value_ == uninitialized; }
|
||||
|
||||
explicit operator bool() const { return !empty(); }
|
||||
|
||||
void set_subcategory(value_type subcategory) {
|
||||
assert(!empty());
|
||||
assert(subcategory != uninitialized);
|
||||
subcategory_ = subcategory;
|
||||
}
|
||||
|
||||
bool has_subcategory() const {
|
||||
return !empty() && subcategory_ != uninitialized;
|
||||
}
|
||||
|
||||
value_type subcategory() const {
|
||||
assert(!empty());
|
||||
assert(subcategory_ != uninitialized);
|
||||
return subcategory_;
|
||||
}
|
||||
|
||||
private:
|
||||
value_type value_;
|
||||
value_type value_{uninitialized};
|
||||
value_type subcategory_{uninitialized};
|
||||
};
|
||||
|
||||
} // namespace dwarfs
|
@ -27,7 +27,7 @@
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
|
||||
#include "dwarfs/file_category.h"
|
||||
#include "dwarfs/inode_fragments.h"
|
||||
#include "dwarfs/nilsimsa.h"
|
||||
#include "dwarfs/object.h"
|
||||
|
||||
@ -59,7 +59,7 @@ class inode : public object {
|
||||
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
|
||||
virtual void
|
||||
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
|
||||
virtual file_category category() const = 0;
|
||||
virtual inode_fragments const& fragments() const = 0;
|
||||
};
|
||||
|
||||
} // namespace dwarfs
|
||||
|
83
include/dwarfs/inode_fragments.h
Normal file
83
include/dwarfs/inode_fragments.h
Normal file
@ -0,0 +1,83 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <iosfwd>
|
||||
#include <span>
|
||||
#include <string>
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
|
||||
#include "dwarfs/fragment_category.h"
|
||||
#include "dwarfs/types.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
class single_inode_fragment {
|
||||
public:
|
||||
single_inode_fragment(fragment_category category, file_off_t length)
|
||||
: category_{category}
|
||||
, length_{length} {}
|
||||
|
||||
fragment_category category() const { return category_; }
|
||||
file_off_t length() const { return length_; }
|
||||
file_off_t size() const { return length_; }
|
||||
|
||||
private:
|
||||
fragment_category category_;
|
||||
file_off_t length_;
|
||||
};
|
||||
|
||||
class inode_fragments {
|
||||
public:
|
||||
using mapper_function_type =
|
||||
std::function<std::string(fragment_category::value_type)>;
|
||||
|
||||
inode_fragments() = default;
|
||||
|
||||
single_inode_fragment&
|
||||
emplace_back(fragment_category category, file_off_t length) {
|
||||
return fragments_.emplace_back(category, length);
|
||||
}
|
||||
|
||||
std::span<single_inode_fragment const> span() const { return fragments_; }
|
||||
|
||||
bool empty() const { return fragments_.empty(); }
|
||||
|
||||
explicit operator bool() const { return !empty(); }
|
||||
|
||||
std::ostream&
|
||||
to_stream(std::ostream& os,
|
||||
mapper_function_type const& mapper = mapper_function_type()) const;
|
||||
std::string
|
||||
to_string(mapper_function_type const& mapper = mapper_function_type()) const;
|
||||
|
||||
private:
|
||||
folly::small_vector<single_inode_fragment, 1> fragments_;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, inode_fragments const& frag) {
|
||||
return frag.to_stream(os);
|
||||
}
|
||||
|
||||
} // namespace dwarfs
|
@ -28,7 +28,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "dwarfs/file_category.h"
|
||||
#include "dwarfs/fragment_category.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
@ -59,7 +59,8 @@ class inode_manager {
|
||||
impl_->for_each_inode_in_order(fn);
|
||||
}
|
||||
|
||||
std::vector<std::pair<file_category, size_t>> category_counts() const {
|
||||
std::vector<std::pair<fragment_category::value_type, size_t>>
|
||||
category_counts() const {
|
||||
return impl_->category_counts();
|
||||
}
|
||||
|
||||
@ -74,7 +75,7 @@ class inode_manager {
|
||||
file_order_options const& file_order, order_cb const& fn) = 0;
|
||||
virtual void for_each_inode_in_order(
|
||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||
virtual std::vector<std::pair<file_category, size_t>>
|
||||
virtual std::vector<std::pair<fragment_category::value_type, size_t>>
|
||||
category_counts() const = 0;
|
||||
};
|
||||
|
||||
|
@ -35,19 +35,20 @@
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
using namespace std::placeholders;
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
namespace {
|
||||
constexpr std::string_view const DEFAULT_CATEGORY{"<default>"};
|
||||
}
|
||||
|
||||
class categorizer_manager_private {
|
||||
class categorizer_manager_private : public categorizer_manager::impl {
|
||||
public:
|
||||
virtual ~categorizer_manager_private() = default;
|
||||
|
||||
virtual std::vector<std::shared_ptr<categorizer const>> const&
|
||||
categorizers() const = 0;
|
||||
virtual file_category category(std::string_view cat) const = 0;
|
||||
virtual fragment_category::value_type
|
||||
category(std::string_view cat) const = 0;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
@ -57,39 +58,48 @@ class categorizer_job_ final : public categorizer_job::impl {
|
||||
std::filesystem::path const& path)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, mgr_{mgr}
|
||||
, path_{path} {}
|
||||
, path_{path}
|
||||
, cat_mapper_{std::bind(&categorizer_manager_private::category,
|
||||
std::cref(mgr_), _1)} {}
|
||||
|
||||
void set_total_size(size_t total_size) override;
|
||||
void categorize_random_access(std::span<uint8_t const> data) override;
|
||||
void categorize_sequential(std::span<uint8_t const> data) override;
|
||||
file_category result() override;
|
||||
inode_fragments result() override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
categorizer_manager_private const& mgr_;
|
||||
|
||||
std::string_view best_{DEFAULT_CATEGORY};
|
||||
inode_fragments best_;
|
||||
int index_{-1};
|
||||
bool is_global_best_{false};
|
||||
size_t total_size_hint_{0};
|
||||
size_t total_size_{0};
|
||||
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
|
||||
seq_jobs_;
|
||||
std::filesystem::path const path_;
|
||||
category_mapper cat_mapper_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void categorizer_job_<LoggerPolicy>::set_total_size(size_t total_size) {
|
||||
total_size_ = total_size;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void categorizer_job_<LoggerPolicy>::categorize_random_access(
|
||||
std::span<uint8_t const> data) {
|
||||
DWARFS_CHECK(index_ < 0,
|
||||
"internal error: index already set in categorize_random_access");
|
||||
|
||||
total_size_hint_ = data.size();
|
||||
total_size_ = data.size();
|
||||
|
||||
bool global_best = true;
|
||||
|
||||
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
|
||||
if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) {
|
||||
if (auto c = p->categorize(path_, data)) {
|
||||
best_ = *c;
|
||||
if (auto c = p->categorize(path_, data, cat_mapper_)) {
|
||||
best_ = c;
|
||||
index_ = index;
|
||||
is_global_best_ = global_best;
|
||||
break;
|
||||
@ -114,7 +124,7 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
|
||||
}
|
||||
|
||||
if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) {
|
||||
if (auto job = p->job(path_, total_size_hint_)) {
|
||||
if (auto job = p->job(path_, total_size_, cat_mapper_)) {
|
||||
seq_jobs_.emplace_back(index, std::move(job));
|
||||
}
|
||||
}
|
||||
@ -127,12 +137,12 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
file_category categorizer_job_<LoggerPolicy>::result() {
|
||||
inode_fragments categorizer_job_<LoggerPolicy>::result() {
|
||||
if (!seq_jobs_.empty()) {
|
||||
for (auto&& [index, job] : seq_jobs_) {
|
||||
if (auto c = job->result()) {
|
||||
assert(index_ < 0 || index < index_);
|
||||
best_ = *c;
|
||||
best_ = c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -140,9 +150,12 @@ file_category categorizer_job_<LoggerPolicy>::result() {
|
||||
seq_jobs_.clear();
|
||||
}
|
||||
|
||||
LOG_TRACE << path_ << " -> " << best_;
|
||||
LOG_TRACE << path_ << " -> "
|
||||
<< best_.to_string([this](fragment_category::value_type c) {
|
||||
return std::string(mgr_.category_name(c));
|
||||
});
|
||||
|
||||
return mgr_.category(best_);
|
||||
return best_;
|
||||
}
|
||||
|
||||
categorizer_job::categorizer_job() = default;
|
||||
@ -151,8 +164,7 @@ categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
|
||||
: impl_{std::move(impl)} {}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class categorizer_manager_ final : public categorizer_manager::impl,
|
||||
public categorizer_manager_private {
|
||||
class categorizer_manager_ final : public categorizer_manager_private {
|
||||
public:
|
||||
categorizer_manager_(logger& lgr)
|
||||
: lgr_{lgr}
|
||||
@ -162,14 +174,15 @@ class categorizer_manager_ final : public categorizer_manager::impl,
|
||||
|
||||
void add(std::shared_ptr<categorizer const> c) override;
|
||||
categorizer_job job(std::filesystem::path const& path) const override;
|
||||
std::string_view category_name(file_category c) const override;
|
||||
std::string_view
|
||||
category_name(fragment_category::value_type c) const override;
|
||||
|
||||
std::vector<std::shared_ptr<categorizer const>> const&
|
||||
categorizers() const override {
|
||||
return categorizers_;
|
||||
}
|
||||
|
||||
file_category category(std::string_view cat) const override {
|
||||
fragment_category::value_type category(std::string_view cat) const override {
|
||||
auto it = catmap_.find(cat);
|
||||
DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat));
|
||||
return it->second;
|
||||
@ -188,7 +201,7 @@ class categorizer_manager_ final : public categorizer_manager::impl,
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
std::vector<std::shared_ptr<categorizer const>> categorizers_;
|
||||
std::vector<std::string_view> categories_;
|
||||
std::unordered_map<std::string_view, file_category> catmap_;
|
||||
std::unordered_map<std::string_view, fragment_category::value_type> catmap_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
@ -210,9 +223,9 @@ categorizer_job categorizer_manager_<LoggerPolicy>::job(
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::string_view
|
||||
categorizer_manager_<LoggerPolicy>::category_name(file_category c) const {
|
||||
return DWARFS_NOTHROW(categories_.at(c.value()));
|
||||
std::string_view categorizer_manager_<LoggerPolicy>::category_name(
|
||||
fragment_category::value_type c) const {
|
||||
return DWARFS_NOTHROW(categories_.at(c));
|
||||
}
|
||||
|
||||
categorizer_manager::categorizer_manager(logger& lgr)
|
||||
|
@ -58,9 +58,9 @@ class binary_categorizer_ final : public binary_categorizer_base {
|
||||
binary_categorizer_(logger& lgr)
|
||||
: LOG_PROXY_INIT(lgr) {}
|
||||
|
||||
std::optional<std::string_view>
|
||||
categorize(std::filesystem::path const& path,
|
||||
std::span<uint8_t const> data) const override;
|
||||
inode_fragments
|
||||
categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
|
||||
category_mapper const& mapper) const override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
@ -74,10 +74,12 @@ std::span<std::string_view const> binary_categorizer_base::categories() const {
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::optional<std::string_view>
|
||||
binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
|
||||
std::span<uint8_t const> data
|
||||
[[maybe_unused]]) const {
|
||||
inode_fragments binary_categorizer_<LoggerPolicy>::categorize(
|
||||
std::filesystem::path const&,
|
||||
std::span<uint8_t const> data [[maybe_unused]],
|
||||
category_mapper const& /*mapper*/) const {
|
||||
inode_fragments fragments;
|
||||
|
||||
#ifndef _WIN32
|
||||
auto p = data.data();
|
||||
if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) {
|
||||
@ -101,7 +103,7 @@ binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
|
||||
}
|
||||
#endif
|
||||
|
||||
return std::nullopt;
|
||||
return fragments;
|
||||
}
|
||||
|
||||
class binary_categorizer_factory : public categorizer_factory {
|
||||
|
@ -57,10 +57,12 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
|
||||
incompressible_categorizer_job_(logger& lgr,
|
||||
incompressible_categorizer_config const& cfg,
|
||||
std::filesystem::path const& path,
|
||||
size_t total_size)
|
||||
size_t total_size,
|
||||
category_mapper const& mapper)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, cfg_{cfg}
|
||||
, path_{path} {
|
||||
, path_{path}
|
||||
, mapper_{mapper} {
|
||||
input_.reserve(total_size < block_size ? total_size : block_size);
|
||||
state_ = ::malloc(LZ4_sizeofState());
|
||||
}
|
||||
@ -77,7 +79,8 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::string_view> result() override {
|
||||
inode_fragments result() override {
|
||||
inode_fragments fragments;
|
||||
if (!input_.empty()) {
|
||||
compress();
|
||||
}
|
||||
@ -88,9 +91,11 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
|
||||
if (total_blocks_ > 0 &&
|
||||
(total_output_size_ >= cfg_.max_ratio_size * total_input_size_ ||
|
||||
incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) {
|
||||
return INCOMPRESSIBLE_CATEGORY;
|
||||
fragments.emplace_back(
|
||||
fragment_category(mapper_(INCOMPRESSIBLE_CATEGORY)),
|
||||
total_input_size_);
|
||||
}
|
||||
return std::nullopt;
|
||||
return fragments;
|
||||
}
|
||||
|
||||
private:
|
||||
@ -139,6 +144,7 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
|
||||
size_t incompressible_blocks_{0};
|
||||
incompressible_categorizer_config const& cfg_;
|
||||
std::filesystem::path const& path_;
|
||||
category_mapper const& mapper_;
|
||||
};
|
||||
|
||||
class incompressible_categorizer_ final : public sequential_categorizer {
|
||||
@ -148,7 +154,8 @@ class incompressible_categorizer_ final : public sequential_categorizer {
|
||||
|
||||
std::span<std::string_view const> categories() const override;
|
||||
std::unique_ptr<sequential_categorizer_job>
|
||||
job(std::filesystem::path const& path, size_t total_size) const override;
|
||||
job(std::filesystem::path const& path, size_t total_size,
|
||||
category_mapper const& mapper) const override;
|
||||
|
||||
private:
|
||||
logger& lgr_;
|
||||
@ -170,7 +177,8 @@ incompressible_categorizer_::categories() const {
|
||||
|
||||
std::unique_ptr<sequential_categorizer_job>
|
||||
incompressible_categorizer_::job(std::filesystem::path const& path,
|
||||
size_t total_size) const {
|
||||
size_t total_size,
|
||||
category_mapper const& mapper) const {
|
||||
if (total_size < config_.min_input_size) {
|
||||
return nullptr;
|
||||
}
|
||||
@ -178,7 +186,7 @@ incompressible_categorizer_::job(std::filesystem::path const& path,
|
||||
return make_unique_logging_object<sequential_categorizer_job,
|
||||
incompressible_categorizer_job_,
|
||||
logger_policies>(lgr_, config_, path,
|
||||
total_size);
|
||||
total_size, mapper);
|
||||
}
|
||||
|
||||
class incompressible_categorizer_factory : public categorizer_factory {
|
||||
|
@ -143,9 +143,9 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base {
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::string_view>
|
||||
categorize(std::filesystem::path const& path,
|
||||
std::span<uint8_t const> data) const override;
|
||||
inode_fragments
|
||||
categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
|
||||
category_mapper const& mapper) const override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
@ -162,15 +162,17 @@ libmagic_categorizer_base::categories() const {
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::optional<std::string_view> libmagic_categorizer_<LoggerPolicy>::categorize(
|
||||
std::filesystem::path const& path, std::span<uint8_t const> data) const {
|
||||
inode_fragments libmagic_categorizer_<LoggerPolicy>::categorize(
|
||||
std::filesystem::path const& path, std::span<uint8_t const> data,
|
||||
category_mapper const& /*mapper*/) const {
|
||||
inode_fragments fragments; // TODO: actually fill this :-)
|
||||
auto id = m_.identify(data);
|
||||
LOG_DEBUG << path << " -> (magic) " << id;
|
||||
{
|
||||
auto wlock = mimetypes_.wlock();
|
||||
++(*wlock)[id];
|
||||
}
|
||||
return std::nullopt;
|
||||
return fragments;
|
||||
}
|
||||
|
||||
class libmagic_categorizer_factory : public categorizer_factory {
|
||||
|
74
src/dwarfs/inode_fragments.cpp
Normal file
74
src/dwarfs/inode_fragments.cpp
Normal file
@ -0,0 +1,74 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "dwarfs/inode_fragments.h"
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
std::ostream&
|
||||
inode_fragments::to_stream(std::ostream& os,
|
||||
mapper_function_type const& mapper) const {
|
||||
if (empty()) {
|
||||
os << "(empty)";
|
||||
} else {
|
||||
os << "[";
|
||||
bool first = true;
|
||||
|
||||
for (auto const& f : span()) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
os << ", ";
|
||||
}
|
||||
|
||||
os << "(";
|
||||
|
||||
auto const& cat = f.category();
|
||||
if (mapper) {
|
||||
os << mapper(cat.value());
|
||||
} else {
|
||||
os << cat.value();
|
||||
}
|
||||
|
||||
if (cat.has_subcategory()) {
|
||||
os << "/" << cat.subcategory();
|
||||
}
|
||||
|
||||
os << ", " << f.size() << ")";
|
||||
}
|
||||
|
||||
os << "]";
|
||||
}
|
||||
|
||||
return os;
|
||||
}
|
||||
|
||||
std::string
|
||||
inode_fragments::to_string(mapper_function_type const& mapper) const {
|
||||
std::ostringstream oss;
|
||||
to_stream(oss, mapper);
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
} // namespace dwarfs
|
@ -155,6 +155,7 @@ class inode_ : public inode {
|
||||
|
||||
if (mm) {
|
||||
if (catjob) {
|
||||
catjob.set_total_size(mm->size());
|
||||
catjob.categorize_random_access(mm->span());
|
||||
}
|
||||
|
||||
@ -201,7 +202,7 @@ class inode_ : public inode {
|
||||
}
|
||||
|
||||
if (catjob) {
|
||||
category_ = catjob.result();
|
||||
fragments_ = catjob.result();
|
||||
}
|
||||
}
|
||||
|
||||
@ -228,12 +229,14 @@ class inode_ : public inode {
|
||||
vec.insert(vec.end(), chunks_.begin(), chunks_.end());
|
||||
}
|
||||
|
||||
file_category category() const override { return category_; }
|
||||
inode_fragments const& fragments() const override { return fragments_; }
|
||||
|
||||
private:
|
||||
// TODO: can we move optional stuff (e.g. nilsimsa_similarity_hash_) out of
|
||||
// here?
|
||||
std::optional<uint32_t> num_;
|
||||
uint32_t similarity_hash_{0};
|
||||
file_category category_;
|
||||
inode_fragments fragments_;
|
||||
files_vector files_;
|
||||
std::vector<chunk_type> chunks_;
|
||||
nilsimsa::hash_type nilsimsa_similarity_hash_;
|
||||
@ -278,23 +281,26 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<file_category, size_t>>
|
||||
std::vector<std::pair<fragment_category::value_type, size_t>>
|
||||
category_counts() const override {
|
||||
std::unordered_map<file_category::value_type, size_t> tmp;
|
||||
std::unordered_map<fragment_category::value_type, size_t> tmp;
|
||||
|
||||
for (auto const& i : inodes_) {
|
||||
++tmp[i->category().value()];
|
||||
if (auto const& fragments = i->fragments(); !fragments.empty()) {
|
||||
for (auto const& frag : fragments.span()) {
|
||||
++tmp[frag.category().value()];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<file_category, size_t>> rv;
|
||||
std::vector<std::pair<fragment_category::value_type, size_t>> rv;
|
||||
|
||||
for (auto const& [k, v] : tmp) {
|
||||
rv.emplace_back(k, v);
|
||||
}
|
||||
|
||||
std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) {
|
||||
return a.first.value() < b.first.value();
|
||||
});
|
||||
std::sort(rv.begin(), rv.end(),
|
||||
[](auto const& a, auto const& b) { return a.first < b.first; });
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user