Better modeling of metadata requirements

This commit is contained in:
Marcus Holland-Moritz 2023-07-31 16:06:50 +02:00
parent e08faf2c0c
commit 9d5969adb7
20 changed files with 662 additions and 176 deletions

View File

@ -443,6 +443,12 @@ add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC})
add_library(dwarfs_categorizer ${LIBDWARFS_CATEGORIZER_SRC})
add_library(dwarfs_tool src/dwarfs/tool.cpp)
add_library(dwarfs_compression_metadata src/dwarfs/compression_metadata_requirements.cpp)
target_link_libraries(dwarfs_compression_metadata folly)
target_link_libraries(dwarfs_categorizer dwarfs_compression_metadata)
target_link_libraries(dwarfs dwarfs_compression_metadata)
if(DWARFS_GIT_BUILD)
target_include_directories(dwarfs PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/include)
endif()
@ -810,7 +816,8 @@ target_link_libraries(metadata_thrift thrift_light)
target_link_libraries(compression_thrift thrift_light)
foreach(tgt dwarfs dwarfs_compression dwarfs_categorizer
dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS})
dwarfs_compression_metadata dwarfs_tool
${BINARY_TARGETS} ${MAIN_TARGETS})
target_include_directories(
${tgt} SYSTEM
PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS}

View File

@ -33,8 +33,6 @@
#include <utility>
#include <vector>
#include <folly/dynamic.h>
#include "dwarfs/compression.h"
namespace dwarfs {
@ -57,22 +55,30 @@ class block_compressor {
block_compressor(block_compressor&& bc) = default;
block_compressor& operator=(block_compressor&& rhs) = default;
std::vector<uint8_t>
compress(std::vector<uint8_t> const& data, folly::dynamic meta) const {
return impl_->compress(data, std::move(meta));
std::vector<uint8_t> compress(std::vector<uint8_t> const& data) const {
return impl_->compress(data, nullptr);
}
std::vector<uint8_t> compress(std::vector<uint8_t>&& data) const {
return impl_->compress(std::move(data), nullptr);
}
std::vector<uint8_t> compress(std::vector<uint8_t> const& data,
std::string const& metadata) const {
return impl_->compress(data, &metadata);
}
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const {
return impl_->compress(std::move(data), std::move(meta));
compress(std::vector<uint8_t>&& data, std::string const& metadata) const {
return impl_->compress(std::move(data), &metadata);
}
compression_type type() const { return impl_->type(); }
std::string describe() const { return impl_->describe(); }
bool check_metadata(folly::dynamic meta) const {
return impl_->check_metadata(std::move(meta));
std::string metadata_requirements() const {
return impl_->metadata_requirements();
}
class impl {
@ -82,14 +88,16 @@ class block_compressor {
virtual std::unique_ptr<impl> clone() const = 0;
virtual std::vector<uint8_t>
compress(const std::vector<uint8_t>& data, folly::dynamic meta) const = 0;
compress(const std::vector<uint8_t>& data,
std::string const* metadata) const = 0;
virtual std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const = 0;
compress(std::vector<uint8_t>&& data,
std::string const* metadata) const = 0;
virtual compression_type type() const = 0;
virtual std::string describe() const = 0;
virtual bool check_metadata(folly::dynamic meta) const = 0;
virtual std::string metadata_requirements() const = 0;
};
private:

View File

@ -31,8 +31,6 @@
#include <span>
#include <string_view>
#include <folly/dynamic.h>
#include "dwarfs/inode_fragments.h"
namespace boost::program_options {
@ -53,9 +51,10 @@ class categorizer {
virtual std::span<std::string_view const> categories() const = 0;
virtual bool is_single_fragment() const = 0;
virtual folly::dynamic
category_metadata(std::string_view category_name,
std::optional<fragment_category> c) const = 0;
virtual std::string
category_metadata(std::string_view category_name, fragment_category c) const;
virtual void set_metadata_requirements(std::string_view category_name,
std::string requirements);
};
class random_access_categorizer : public categorizer {
@ -128,7 +127,7 @@ class categorizer_manager {
static fragment_category default_category();
void add(std::shared_ptr<categorizer const> c) { impl_->add(std::move(c)); }
void add(std::shared_ptr<categorizer> c) { impl_->add(std::move(c)); }
categorizer_job job(std::filesystem::path const& path) const {
return impl_->job(path);
@ -143,28 +142,28 @@ class categorizer_manager {
return impl_->category_value(name);
}
folly::dynamic category_metadata(fragment_category c) const {
std::string category_metadata(fragment_category c) const {
return impl_->category_metadata(c);
}
folly::dynamic
category_metadata_sample(fragment_category::value_type c) const {
return impl_->category_metadata_sample(c);
void
set_metadata_requirements(fragment_category::value_type c, std::string req) {
impl_->set_metadata_requirements(c, std::move(req));
}
class impl {
public:
virtual ~impl() = default;
virtual void add(std::shared_ptr<categorizer const> c) = 0;
virtual void add(std::shared_ptr<categorizer> c) = 0;
virtual categorizer_job job(std::filesystem::path const& path) const = 0;
virtual std::string_view
category_name(fragment_category::value_type c) const = 0;
virtual std::optional<fragment_category::value_type>
category_value(std::string_view name) const = 0;
virtual folly::dynamic category_metadata(fragment_category c) const = 0;
virtual folly::dynamic
category_metadata_sample(fragment_category::value_type c) const = 0;
virtual std::string category_metadata(fragment_category c) const = 0;
virtual void set_metadata_requirements(fragment_category::value_type c,
std::string req) = 0;
};
private:

View File

@ -0,0 +1,291 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>
#include <fmt/format.h>
#include <folly/dynamic.h>
namespace dwarfs {
namespace detail {
/**
 * Default conversion of a requirement value to `T`.
 *
 * - `T == std::string`: returns the value's `asString()` conversion.
 * - integral `T`: returns the value's `asInt()` conversion, but only if the
 *   64-bit result is representable in `T`.
 *
 * \return the converted value, or an empty optional if an integer does not
 *         fit into `T`. Exceptions raised by the folly conversions
 *         themselves are propagated unchanged.
 */
template <typename T>
std::optional<T> value_parser(folly::dynamic const& v) {
  if constexpr (std::is_same_v<T, std::string>) {
    return v.asString();
  } else {
    static_assert(std::is_integral_v<T>);

    // asInt() yields a signed 64-bit value. Converting it implicitly to a
    // narrower T would silently wrap out-of-range input, so reject such
    // values explicitly instead.
    std::int64_t const raw = v.asInt();

    if constexpr (std::is_signed_v<T>) {
      if (raw < std::numeric_limits<T>::min() ||
          raw > std::numeric_limits<T>::max()) {
        return std::nullopt;
      }
    } else {
      if (raw < 0 ||
          static_cast<std::uint64_t>(raw) > std::numeric_limits<T>::max()) {
        return std::nullopt;
      }
    }

    return static_cast<T>(raw);
  }
}
// Shared validation for a single requirement entry; defined out of line.
// NOTE(review): judging by the call sites in this header, this presumably
// verifies that `dyn` is an array of `expected_size` elements whose first
// element equals `expected_type` ("set" / "range") and throws otherwise,
// using `name` for the error message -- confirm against the definition.
void check_dynamic_common(folly::dynamic const& dyn,
std::string_view expected_type, size_t expected_size,
std::string_view name);
// Invoked once all known requirements have been parsed (and erased) from
// `req`; defined out of line.
// NOTE(review): presumably throws if any keys are left over in `req` --
// confirm against the definition.
void check_unsupported_metadata_requirements(folly::dynamic& req);
/**
 * Extract the "set" requirement called `name` from `req`.
 *
 * The entry's second element must be an array. Every element of that array
 * is passed through `value_parser`; elements for which it yields no value
 * are skipped, all others are inserted into `container`. After successful
 * parsing the entry is erased from `req`.
 *
 * \param container     set-like container receiving the parsed values
 * \param req           requirements object; the entry is removed on success
 * \param name          key of the requirement
 * \param value_parser  callable mapping a `folly::dynamic` to an optional
 *                      value
 * \return `true` if `req` contained `name`, `false` otherwise (neither
 *         `container` nor `req` is modified in that case)
 * \throws std::runtime_error if the second element is not an array or if a
 *         value occurs more than once; anything thrown by
 *         `check_dynamic_common` or `value_parser` is propagated. Note that
 *         `container` may already hold some values when an exception is
 *         thrown.
 */
template <typename T, typename ValueParser>
bool parse_metadata_requirements_set(T& container, folly::dynamic& req,
                                     std::string_view name,
                                     ValueParser const& value_parser) {
  if (auto it = req.find(name); it != req.items().end()) {
    detail::check_dynamic_common(it->second, "set", 2, name);

    if (it->second[1].type() != folly::dynamic::ARRAY) {
      throw std::runtime_error(
          fmt::format("non-array type argument for requirement '{}'", name));
    }

    // Iterate by reference: copying a folly::dynamic is a deep copy.
    for (auto const& v : it->second[1]) {
      if (auto maybe_value = value_parser(v)) {
        if (!container.emplace(*maybe_value).second) {
          throw std::runtime_error(fmt::format(
              "duplicate value '{}' for requirement '{}'", v.asString(), name));
        }
      }
    }

    req.erase(it);
    return true;
  }

  return false;
}
/**
 * Extract the "range" requirement called `name` from `req`.
 *
 * Elements 1 and 2 of the entry are parsed via `value_parser` as the
 * minimum and maximum. `min` and `max` are only written once both bounds
 * have been parsed and validated, so a failing call leaves them untouched.
 * After successful parsing the entry is erased from `req`.
 *
 * \param min, max      receive the parsed bounds on success
 * \param req           requirements object; the entry is removed on success
 * \param name          key of the requirement
 * \param value_parser  callable mapping a `folly::dynamic` to an optional
 *                      value
 * \return `true` if `req` contained `name`, `false` otherwise
 * \throws std::runtime_error if a bound cannot be parsed or if the minimum
 *         exceeds the maximum; anything thrown by `check_dynamic_common` or
 *         `value_parser` is propagated.
 */
template <typename T, typename ValueParser>
bool parse_metadata_requirements_range(T& min, T& max, folly::dynamic& req,
                                       std::string_view name,
                                       ValueParser const& value_parser) {
  if (auto it = req.find(name); it != req.items().end()) {
    detail::check_dynamic_common(it->second, "range", 3, name);

    auto get_value = [&](std::string_view what, int index) {
      if (auto maybe_value = value_parser(it->second[index])) {
        return *maybe_value;
      }
      throw std::runtime_error(
          fmt::format("could not parse {} value '{}' for requirement '{}'",
                      what, it->second[index].asString(), name));
    };

    // Parse into locals first so the output parameters are never left in a
    // half-updated state when validation fails.
    T lower = get_value("minimum", 1);
    T upper = get_value("maximum", 2);

    if (lower > upper) {
      throw std::runtime_error(fmt::format(
          "expected minimum '{}' to be less than or equal "
          "to maximum '{}' for requirement '{}'",
          it->second[1].asString(), it->second[2].asString(), name));
    }

    min = std::move(lower);
    max = std::move(upper);

    req.erase(it);
    return true;
  }

  return false;
}
/**
 * Common base of all metadata requirements.
 *
 * Stores the name (key) of the requirement and declares the interface used
 * to parse the requirement's definition out of a requirements object.
 */
class metadata_requirement_base {
 public:
  virtual ~metadata_requirement_base() = default;

  // `explicit`: a requirement must never be created implicitly from a
  // string.
  explicit metadata_requirement_base(std::string const& name)
      : name_{name} {}

  /// Parse this requirement's definition from `req`.
  virtual void parse(folly::dynamic& req) = 0;

  /// Name (key) of this requirement.
  std::string_view name() const { return name_; }

 private:
  std::string const name_;
};
// Requirement that can additionally be checked against an instance of the
// metadata type `Meta`. Constructors are inherited from
// metadata_requirement_base.
template <typename Meta>
class checked_metadata_requirement_base : public metadata_requirement_base {
public:
using metadata_requirement_base::metadata_requirement_base;
// Check `m` against this requirement. The implementations in this header
// signal a violation by throwing (std::range_error).
virtual void check(Meta const& m) const = 0;
};
/**
 * Requirement bound to one data member of `Meta`.
 *
 * \tparam Meta  metadata type being checked
 * \tparam T     value type the requirement operates on
 * \tparam U     declared type of the `Meta` member; it is converted to `T`
 *               when passed to `check_value()`
 */
template <typename Meta, typename T, typename U>
class typed_metadata_requirement_base
    : public checked_metadata_requirement_base<Meta> {
 public:
  using value_parser_type =
      std::function<std::optional<T>(folly::dynamic const& v)>;
  using member_ptr_type = U(Meta::*);

  /// Bind `name` to member `mp`, using the default `detail::value_parser<T>`.
  typed_metadata_requirement_base(std::string const& name, member_ptr_type mp)
      : checked_metadata_requirement_base<Meta>(name)
      , mp_{mp}
      , value_parser_{detail::value_parser<T>} {}

  /// Bind `name` to member `mp`, using a custom value parser.
  typed_metadata_requirement_base(std::string const& name, member_ptr_type mp,
                                  value_parser_type value_parser)
      : checked_metadata_requirement_base<Meta>(name)
      , mp_{mp}
      // The parser is a by-value sink parameter: move it into place rather
      // than copying the std::function a second time.
      , value_parser_{std::move(value_parser)} {}

  /// Check the bound member of `m` via `check_value()`.
  void check(Meta const& m) const override { check_value(m.*mp_); }

  /// Parser used to convert requirement values to `T`.
  value_parser_type const& value_parser() const { return value_parser_; }

 protected:
  /// Check a single value against the parsed requirement.
  virtual void check_value(T const& value) const = 0;

 private:
  member_ptr_type mp_;
  value_parser_type value_parser_;
};
/**
 * Requirement restricting a `Meta` member to a set of accepted values.
 *
 * As long as no set has been parsed, every value is accepted.
 */
template <typename Meta, typename T, typename U = T>
class metadata_requirement_set
    : public typed_metadata_requirement_base<Meta, T, U> {
 public:
  using typed_metadata_requirement_base<Meta, T,
                                        U>::typed_metadata_requirement_base;

  /// (Re-)parse the set from `req`; any previously parsed set is discarded
  /// before parsing starts.
  void parse(folly::dynamic& req) override {
    set_.reset();

    std::unordered_set<T> accepted;
    bool const present = parse_metadata_requirements_set(
        accepted, req, this->name(), this->value_parser());

    if (present) {
      set_.emplace(std::move(accepted));
    }
  }

 protected:
  /// Throws std::range_error if a set is active and `value` is not in it.
  void check_value(T const& value) const override {
    if (!set_) {
      return;
    }

    if (set_->count(value) != 0) {
      return;
    }

    throw std::range_error(fmt::format("{} '{}' does not meet requirements",
                                       this->name(), value));
  }

 private:
  std::optional<std::unordered_set<T>> set_;
};
/**
 * Requirement restricting a `Meta` member to an inclusive `[min..max]`
 * range.
 *
 * As long as no range has been parsed, every value is accepted.
 */
template <typename Meta, typename T, typename U = T>
class metadata_requirement_range
    : public typed_metadata_requirement_base<Meta, T, U> {
 public:
  using typed_metadata_requirement_base<Meta, T,
                                        U>::typed_metadata_requirement_base;

  /// (Re-)parse the range from `req`; any previously parsed range is
  /// discarded before parsing starts.
  void parse(folly::dynamic& req) override {
    range_.reset();

    // Value-initialise the bounds so they never hold indeterminate values,
    // regardless of what the parser does with its output parameters.
    T min{};
    T max{};

    if (parse_metadata_requirements_range(min, max, req, this->name(),
                                          this->value_parser())) {
      range_.emplace(min, max);
    }
  }

 protected:
  /// Throws std::range_error if a range is active and `value` lies outside
  /// of it (both bounds are inclusive).
  void check_value(T const& value) const override {
    if (range_ && (value < range_->first || value > range_->second)) {
      throw std::range_error(
          fmt::format("{} '{}' does not meet requirements [{}..{}]",
                      this->name(), value, range_->first, range_->second));
    }
  }

 private:
  std::optional<std::pair<T, T>> range_;
};
} // namespace detail
// Collection of requirements that metadata of type `Meta` has to meet.
//
// Requirements are registered with add_set() / add_range(), each bound to
// one data member of `Meta`. parse() then loads the concrete constraints
// from a requirements object and check() validates a `Meta` instance
// against them.
template <typename Meta = void>
class compression_metadata_requirements {
public:
compression_metadata_requirements() = default;
// Register a "set" requirement `name` for member `mp` with a custom value
// parser. `T` defaults to the value type of the optional returned by the
// parser when invoked with a folly::dynamic.
template <
typename F, typename U,
typename T = typename std::invoke_result_t<F, folly::dynamic>::value_type>
void add_set(std::string const& name, U(Meta::*mp), F&& value_parser) {
req_.emplace_back(
std::make_unique<detail::metadata_requirement_set<Meta, T, U>>(
name, mp, std::forward<F>(value_parser)));
}
// Register a "set" requirement using the default detail::value_parser<T>;
// `T` has to be given explicitly by the caller.
template <typename T, typename U>
void add_set(std::string const& name, U(Meta::*mp)) {
add_set(name, mp, detail::value_parser<T>);
}
// Register a "range" requirement `name` for member `mp` with a custom value
// parser. `T` defaults to the value type of the optional returned by the
// parser when invoked with a folly::dynamic.
template <
typename F, typename U,
typename T = typename std::invoke_result_t<F, folly::dynamic>::value_type>
void add_range(std::string const& name, U(Meta::*mp), F&& value_parser) {
req_.emplace_back(
std::make_unique<detail::metadata_requirement_range<Meta, T, U>>(
name, mp, std::forward<F>(value_parser)));
}
// Register a "range" requirement using the default detail::value_parser<T>;
// `T` has to be given explicitly by the caller.
template <typename T, typename U>
void add_range(std::string const& name, U(Meta::*mp)) {
add_range(name, mp, detail::value_parser<T>);
}
// Let every registered requirement parse its entry from `req` (taken by
// value, so the caller's object is not modified), then pass the remainder
// to detail::check_unsupported_metadata_requirements().
// NOTE(review): although declared const, this calls the non-const parse()
// of the stored requirement objects through the unique_ptrs and thus
// changes what a later check() enforces -- confirm this is intended.
void parse(folly::dynamic req) const {
for (auto const& r : req_) {
r->parse(req);
}
detail::check_unsupported_metadata_requirements(req);
}
// Check `meta` against every registered requirement, in registration
// order. Exceptions thrown by a requirement's check() are propagated.
void check(Meta const& meta) const {
for (auto const& r : req_) {
r->check(meta);
}
}
private:
// Registered requirements, in registration order.
std::vector<std::unique_ptr<detail::checked_metadata_requirement_base<Meta>>>
req_;
};
// Specialization for the case where there is no metadata type: no
// requirements can be registered, so parse() hands the requirements
// straight to detail::check_unsupported_metadata_requirements().
template <>
class compression_metadata_requirements<void> {
public:
void parse(folly::dynamic req) const {
detail::check_unsupported_metadata_requirements(req);
}
};
} // namespace dwarfs

View File

@ -26,10 +26,13 @@
#include <fmt/format.h>
#include <folly/String.h>
#include <folly/container/Enumerate.h>
#include <folly/json.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/compiler.h"
#include "dwarfs/compression_metadata_requirements.h"
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
@ -45,9 +48,21 @@ constexpr std::string_view const DEFAULT_CATEGORY{"<default>"};
}
// Default implementation: a categorizer provides no metadata for any
// category, expressed as an empty string.
std::string categorizer::category_metadata(std::string_view /*category_name*/,
                                           fragment_category /*c*/) const {
  return {};
}
// Default implementation for categorizers without metadata: an empty
// requirements string is accepted as is; anything else is parsed as JSON
// and handed to the metadata-less compression_metadata_requirements for
// validation.
void categorizer::set_metadata_requirements(std::string_view /*category_name*/,
                                            std::string requirements) {
  if (requirements.empty()) {
    return;
  }

  auto const parsed = folly::parseJson(requirements);
  compression_metadata_requirements().parse(parsed);
}
class categorizer_manager_private : public categorizer_manager::impl {
public:
virtual std::vector<std::shared_ptr<categorizer const>> const&
virtual std::vector<std::shared_ptr<categorizer>> const&
categorizers() const = 0;
virtual fragment_category::value_type
category(std::string_view cat) const = 0;
@ -100,7 +115,7 @@ void categorizer_job_<LoggerPolicy>::categorize_random_access(
bool global_best = true;
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) {
if (auto p = dynamic_cast<random_access_categorizer*>(cat.get())) {
if (auto c = p->categorize(path_, data, cat_mapper_)) {
best_ = c;
index_ = index;
@ -126,7 +141,7 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
break;
}
if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) {
if (auto p = dynamic_cast<sequential_categorizer*>(cat.get())) {
if (auto job = p->job(path_, total_size_, cat_mapper_)) {
seq_jobs_.emplace_back(index, std::move(job));
}
@ -180,7 +195,7 @@ class categorizer_manager_ final : public categorizer_manager_private {
add_category(DEFAULT_CATEGORY, std::numeric_limits<size_t>::max());
}
void add(std::shared_ptr<categorizer const> c) override;
void add(std::shared_ptr<categorizer> c) override;
categorizer_job job(std::filesystem::path const& path) const override;
std::string_view
category_name(fragment_category::value_type c) const override;
@ -194,12 +209,12 @@ class categorizer_manager_ final : public categorizer_manager_private {
return rv;
}
folly::dynamic category_metadata(fragment_category c) const override;
std::string category_metadata(fragment_category c) const override;
folly::dynamic
category_metadata_sample(fragment_category::value_type c) const override;
void set_metadata_requirements(fragment_category::value_type c,
std::string req) override;
std::vector<std::shared_ptr<categorizer const>> const&
std::vector<std::shared_ptr<categorizer>> const&
categorizers() const override {
return categorizers_;
}
@ -211,8 +226,6 @@ class categorizer_manager_ final : public categorizer_manager_private {
}
private:
folly::dynamic category_metadata_impl(fragment_category c, bool sample) const;
void add_category(std::string_view cat, size_t categorizer_index) {
if (catmap_.emplace(cat, categories_.size()).second) {
categories_.emplace_back(cat, categorizer_index);
@ -223,7 +236,7 @@ class categorizer_manager_ final : public categorizer_manager_private {
logger& lgr_;
LOG_PROXY_DECL(LoggerPolicy);
std::vector<std::shared_ptr<categorizer const>> categorizers_;
std::vector<std::shared_ptr<categorizer>> categorizers_;
// TODO: category descriptions?
std::vector<std::pair<std::string_view, size_t>> categories_;
std::unordered_map<std::string_view, fragment_category::value_type> catmap_;
@ -234,8 +247,7 @@ fragment_category categorizer_manager::default_category() {
}
template <typename LoggerPolicy>
void categorizer_manager_<LoggerPolicy>::add(
std::shared_ptr<categorizer const> c) {
void categorizer_manager_<LoggerPolicy>::add(std::shared_ptr<categorizer> c) {
for (auto const& c : c->categories()) {
add_category(c, categorizers_.size());
}
@ -258,34 +270,25 @@ std::string_view categorizer_manager_<LoggerPolicy>::category_name(
}
template <typename LoggerPolicy>
folly::dynamic
categorizer_manager_<LoggerPolicy>::category_metadata_impl(fragment_category c,
bool sample) const {
std::string categorizer_manager_<LoggerPolicy>::category_metadata(
fragment_category c) const {
if (c.value() == 0) {
return folly::dynamic();
return std::string();
}
auto cat = DWARFS_NOTHROW(categories_.at(c.value()));
auto categorizer = DWARFS_NOTHROW(categorizers_.at(cat.second));
std::optional<fragment_category> maybe_category;
if (!sample) {
maybe_category.emplace(c);
}
return categorizer->category_metadata(cat.first, maybe_category);
return categorizer->category_metadata(cat.first, c);
}
template <typename LoggerPolicy>
folly::dynamic categorizer_manager_<LoggerPolicy>::category_metadata(
fragment_category c) const {
return category_metadata_impl(c, false);
}
void categorizer_manager_<LoggerPolicy>::set_metadata_requirements(
fragment_category::value_type c, std::string req) {
auto cat = DWARFS_NOTHROW(categories_.at(c));
auto categorizer = DWARFS_NOTHROW(categorizers_.at(cat.second));
template <typename LoggerPolicy>
folly::dynamic categorizer_manager_<LoggerPolicy>::category_metadata_sample(
fragment_category::value_type c) const {
return category_metadata_impl(fragment_category(c), true);
categorizer->set_metadata_requirements(cat.first, req);
}
categorizer_manager::categorizer_manager(logger& lgr)

View File

@ -64,12 +64,6 @@ class binary_categorizer_ final : public binary_categorizer_base {
bool is_single_fragment() const override { return false; }
folly::dynamic
category_metadata(std::string_view,
std::optional<fragment_category>) const override {
return folly::dynamic();
}
private:
LOG_PROXY_DECL(LoggerPolicy);
};

View File

@ -166,12 +166,6 @@ class incompressible_categorizer_ final : public sequential_categorizer {
bool is_single_fragment() const override { return true; }
folly::dynamic
category_metadata(std::string_view,
std::optional<fragment_category>) const override {
return folly::dynamic();
}
private:
logger& lgr_;
incompressible_categorizer_config const config_;

View File

@ -149,12 +149,6 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base {
bool is_single_fragment() const override { return true; }
folly::dynamic
category_metadata(std::string_view,
std::optional<fragment_category>) const override {
return folly::dynamic();
}
private:
LOG_PROXY_DECL(LoggerPolicy);
magic_wrapper m_;

View File

@ -30,13 +30,17 @@
#include <boost/program_options.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h>
#include <folly/Synchronized.h>
#include <folly/json.h>
#include <folly/lang/Bits.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/compression_metadata_requirements.h"
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
#include "dwarfs/map_util.h"
namespace dwarfs {
@ -46,7 +50,7 @@ namespace po = boost::program_options;
namespace {
constexpr std::string_view const METADATA_CATEGORY{"pcmaudio/metadata"};
constexpr std::string_view const PCMAUDIO_CATEGORY{"pcmaudio/waveform"};
constexpr std::string_view const WAVEFORM_CATEGORY{"pcmaudio/waveform"};
constexpr size_t const MIN_PCMAUDIO_SIZE{32};
@ -65,33 +69,97 @@ enum class padding : uint8_t {
MSB,
};
char const* endianness_string(endianness e) {
std::ostream& operator<<(std::ostream& os, endianness e) {
switch (e) {
case endianness::BIG:
return "big";
os << "big";
break;
case endianness::LITTLE:
return "little";
os << "little";
break;
default:
throw std::runtime_error("internal error: unhandled endianness value");
}
return os;
}
char const* signedness_string(signedness s) {
switch (s) {
std::optional<endianness> parse_endianness(std::string_view e) {
static std::unordered_map<std::string_view, endianness> const lookup{
{"big", endianness::BIG},
{"little", endianness::LITTLE},
};
return get_optional(lookup, e);
}
std::optional<endianness> parse_endianness_dyn(folly::dynamic const& e) {
return parse_endianness(e.asString());
}
std::ostream& operator<<(std::ostream& os, signedness e) {
switch (e) {
case signedness::SIGNED:
return "signed";
os << "signed";
break;
case signedness::UNSIGNED:
return "unsigned";
os << "unsigned";
break;
default:
throw std::runtime_error("internal error: unhandled signedness value");
}
return os;
}
char const* padding_string(padding p) {
switch (p) {
case padding::LSB:
return "lsb";
case padding::MSB:
return "msb";
}
std::optional<signedness> parse_signedness(std::string_view s) {
static std::unordered_map<std::string_view, signedness> const lookup{
{"signed", signedness::SIGNED},
{"unsigned", signedness::UNSIGNED},
};
return get_optional(lookup, s);
}
std::optional<signedness> parse_signedness_dyn(folly::dynamic const& s) {
return parse_signedness(s.asString());
}
std::ostream& operator<<(std::ostream& os, padding e) {
switch (e) {
case padding::LSB:
os << "lsb";
break;
case padding::MSB:
os << "msb";
break;
default:
throw std::runtime_error("internal error: unhandled padding value");
}
return os;
}
std::optional<padding> parse_padding(std::string_view p) {
static std::unordered_map<std::string_view, padding> const lookup{
{"lsb", padding::LSB},
{"msb", padding::MSB},
};
return get_optional(lookup, p);
}
std::optional<padding> parse_padding_dyn(folly::dynamic const& p) {
return parse_padding(p.asString());
}
} // namespace
} // namespace dwarfs
template <>
struct fmt::formatter<dwarfs::endianness> : ostream_formatter {};
template <>
struct fmt::formatter<dwarfs::signedness> : ostream_formatter {};
template <>
struct fmt::formatter<dwarfs::padding> : ostream_formatter {};
namespace dwarfs {
namespace {
struct pcmaudio_metadata {
endianness sample_endianness;
signedness sample_signedness;
@ -325,9 +393,8 @@ class iff_parser final {
};
std::ostream& operator<<(std::ostream& os, pcmaudio_metadata const& m) {
os << "[" << endianness_string(m.sample_endianness) << ", "
<< signedness_string(m.sample_signedness) << ", "
<< padding_string(m.sample_padding) << ", "
os << "[" << m.sample_endianness << ", " << m.sample_signedness << ", "
<< m.sample_padding << ", "
<< "bits=" << static_cast<int>(m.bits_per_sample) << ", "
<< "bytes=" << static_cast<int>(m.bytes_per_sample) << ", "
<< "channels=" << static_cast<int>(m.number_of_channels) << "]";
@ -349,27 +416,16 @@ class pcmaudio_metadata_store {
return it->second;
}
folly::dynamic lookup(size_t ix) const {
std::string lookup(size_t ix) const {
auto const& m = DWARFS_NOTHROW(forward_index_.at(ix));
folly::dynamic obj = folly::dynamic::object;
obj.insert("endianness", endianness_string(m.sample_endianness));
obj.insert("signedness", signedness_string(m.sample_signedness));
obj.insert("padding", padding_string(m.sample_padding));
obj.insert("endianness", fmt::format("{}", m.sample_endianness));
obj.insert("signedness", fmt::format("{}", m.sample_signedness));
obj.insert("padding", fmt::format("{}", m.sample_padding));
obj.insert("bytes_per_sample", m.bytes_per_sample);
obj.insert("bits_per_sample", m.bits_per_sample);
obj.insert("number_of_channels", m.number_of_channels);
return obj;
}
static folly::dynamic sample() {
folly::dynamic obj = folly::dynamic::object;
obj.insert("endianness", endianness_string(endianness::BIG));
obj.insert("signedness", signedness_string(signedness::SIGNED));
obj.insert("padding", padding_string(padding::LSB));
obj.insert("bytes_per_sample", 2);
obj.insert("bits_per_sample", 16);
obj.insert("number_of_channels", 2);
return obj;
return folly::toJson(obj);
}
private:
@ -386,7 +442,20 @@ template <typename LoggerPolicy>
class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base {
public:
pcmaudio_categorizer_(logger& lgr)
: LOG_PROXY_INIT(lgr) {}
: LOG_PROXY_INIT(lgr) {
waveform_req_.add_set("endianness", &pcmaudio_metadata::sample_endianness,
parse_endianness_dyn);
waveform_req_.add_set("signedness", &pcmaudio_metadata::sample_signedness,
parse_signedness_dyn);
waveform_req_.add_set("padding", &pcmaudio_metadata::sample_padding,
parse_padding_dyn);
waveform_req_.add_range<int>("bytes_per_sample",
&pcmaudio_metadata::bytes_per_sample);
waveform_req_.add_range<int>("bits_per_sample",
&pcmaudio_metadata::bits_per_sample);
waveform_req_.add_range<int>("number_of_channels",
&pcmaudio_metadata::number_of_channels);
}
inode_fragments
categorize(fs::path const& path, std::span<uint8_t const> data,
@ -394,21 +463,19 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base {
bool is_single_fragment() const override { return false; }
folly::dynamic
category_metadata(std::string_view category_name,
std::optional<fragment_category> c) const override {
if (category_name == PCMAUDIO_CATEGORY) {
if (c) {
DWARFS_CHECK(c->has_subcategory(),
"expected PCMAUDIO to have subcategory");
return meta_.rlock()->lookup(c->subcategory());
} else {
return pcmaudio_metadata_store::sample();
}
std::string category_metadata(std::string_view category_name,
fragment_category c) const override {
if (category_name == WAVEFORM_CATEGORY) {
DWARFS_CHECK(c.has_subcategory(),
"expected PCMAUDIO to have subcategory");
return meta_.rlock()->lookup(c.subcategory());
}
return folly::dynamic();
return std::string();
}
void set_metadata_requirements(std::string_view category_name,
std::string requirements) override;
private:
bool check_aiff(inode_fragments& frag, fs::path const& path,
std::span<uint8_t const> data,
@ -428,15 +495,20 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base {
std::span<uint8_t const> data,
category_mapper const& mapper) const;
bool check_metadata_requirements(pcmaudio_metadata const& meta,
std::string_view context,
fs::path const& path) const;
LOG_PROXY_DECL(LoggerPolicy);
folly::Synchronized<pcmaudio_metadata_store> mutable meta_;
compression_metadata_requirements<pcmaudio_metadata> waveform_req_;
};
std::span<std::string_view const>
pcmaudio_categorizer_base::categories() const {
static constexpr std::array const s_categories{
METADATA_CATEGORY,
PCMAUDIO_CATEGORY,
WAVEFORM_CATEGORY,
};
return s_categories;
}
@ -517,6 +589,10 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_aiff(
return false;
}
if (!check_metadata_requirements(meta, "AIFF", path)) {
return false;
}
meta_valid = true;
LOG_TRACE << "[AIFF] " << path << ": meta=" << meta;
@ -553,7 +629,7 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_aiff(
frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)),
pcm_start);
frag.emplace_back(
fragment_category(mapper(PCMAUDIO_CATEGORY), subcategory),
fragment_category(mapper(WAVEFORM_CATEGORY), subcategory),
pcm_length);
if (pcm_start + pcm_length < data.size()) {
@ -710,6 +786,10 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_caf(
return false;
}
if (!check_metadata_requirements(meta, "CAF", path)) {
return false;
}
meta_valid = true;
LOG_TRACE << "[CAF] " << path << ": meta=" << meta;
@ -736,7 +816,7 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_caf(
frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)),
pcm_start);
frag.emplace_back(
fragment_category(mapper(PCMAUDIO_CATEGORY), subcategory),
fragment_category(mapper(WAVEFORM_CATEGORY), subcategory),
pcm_length);
if (pcm_start + pcm_length < data.size()) {
@ -885,6 +965,10 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_wav_like(
return false;
}
if (!check_metadata_requirements(meta, FormatPolicy::format_name, path)) {
return false;
}
meta_valid = true;
LOG_TRACE << "[" << FormatPolicy::format_name << "] " << path
@ -912,7 +996,7 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_wav_like(
frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)),
pcm_start);
frag.emplace_back(
fragment_category(mapper(PCMAUDIO_CATEGORY), subcategory),
fragment_category(mapper(WAVEFORM_CATEGORY), subcategory),
pcm_length);
if (pcm_start + pcm_length < data.size()) {
@ -927,6 +1011,20 @@ bool pcmaudio_categorizer_<LoggerPolicy>::check_wav_like(
return false;
}
// Check `meta` against the waveform requirements. A violation (reported by
// the requirements as an exception) is logged as a warning, prefixed with
// the format `context` and the file `path`, and turned into a `false`
// return value.
template <typename LoggerPolicy>
bool pcmaudio_categorizer_<LoggerPolicy>::check_metadata_requirements(
    pcmaudio_metadata const& meta, std::string_view context,
    fs::path const& path) const {
  bool satisfied = true;

  try {
    waveform_req_.check(meta);
  } catch (std::exception const& ex) {
    LOG_WARN << "[" << context << "] " << path << ": " << ex.what();
    satisfied = false;
  }

  return satisfied;
}
template <typename LoggerPolicy>
inode_fragments pcmaudio_categorizer_<LoggerPolicy>::categorize(
fs::path const& path, std::span<uint8_t const> data,
@ -954,6 +1052,19 @@ inode_fragments pcmaudio_categorizer_<LoggerPolicy>::categorize(
return fragments;
}
// Install metadata requirements for `category_name`. An empty requirements
// string is a no-op. Otherwise the string is parsed as JSON; for the
// waveform category it is loaded into the waveform requirements, for any
// other category it is validated by the metadata-less
// compression_metadata_requirements.
template <typename LoggerPolicy>
void pcmaudio_categorizer_<LoggerPolicy>::set_metadata_requirements(
    std::string_view category_name, std::string requirements) {
  if (requirements.empty()) {
    return;
  }

  auto parsed = folly::parseJson(requirements);

  if (category_name != WAVEFORM_CATEGORY) {
    compression_metadata_requirements().parse(parsed);
    return;
  }

  waveform_req_.parse(parsed);
}
class pcmaudio_categorizer_factory : public categorizer_factory {
public:
std::string_view name() const override { return "pcmaudio"; }

View File

@ -21,6 +21,8 @@
#include <fmt/format.h>
#include <folly/String.h>
#include "dwarfs/categorizer.h"
#include "dwarfs/category_parser.h"

View File

@ -49,8 +49,9 @@ class brotli_block_compressor final : public block_compressor::impl {
return std::make_unique<brotli_block_compressor>(*this);
}
std::vector<uint8_t> compress(const std::vector<uint8_t>& data,
folly::dynamic /*meta*/) const override {
std::vector<uint8_t>
compress(const std::vector<uint8_t>& data,
std::string const* /*metadata*/) const override {
std::vector<uint8_t> compressed;
compressed.resize(folly::kMaxVarintLength64 +
::BrotliEncoderMaxCompressedSize(data.size()));
@ -69,9 +70,9 @@ class brotli_block_compressor final : public block_compressor::impl {
return compressed;
}
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const override {
return compress(data, std::move(meta));
std::vector<uint8_t> compress(std::vector<uint8_t>&& data,
std::string const* metadata) const override {
return compress(data, metadata);
}
compression_type type() const override { return compression_type::BROTLI; }
@ -80,7 +81,7 @@ class brotli_block_compressor final : public block_compressor::impl {
return fmt::format("brotli [quality={}, lgwin={}]", quality_, window_bits_);
}
bool check_metadata(folly::dynamic /*meta*/) const override { return true; }
std::string metadata_requirements() const override { return std::string(); }
private:
uint32_t const quality_;

View File

@ -31,6 +31,7 @@
#include <fmt/format.h>
#include <folly/Varint.h>
#include <folly/json.h>
#include "dwarfs/block_compressor.h"
#include "dwarfs/compression.h"
@ -204,7 +205,14 @@ class flac_block_compressor final : public block_compressor::impl {
}
std::vector<uint8_t> compress(const std::vector<uint8_t>& data,
folly::dynamic meta) const override {
std::string const* metadata) const override {
if (!metadata) {
DWARFS_THROW(runtime_error,
"internal error: flac compression requires metadata");
}
auto meta = folly::parseJson(*metadata);
auto endianness = meta["endianness"].asString();
auto signedness = meta["signedness"].asString();
auto padding = meta["padding"].asString();
@ -332,9 +340,9 @@ class flac_block_compressor final : public block_compressor::impl {
return compressed;
}
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const override {
return compress(data, std::move(meta));
std::vector<uint8_t> compress(std::vector<uint8_t>&& data,
std::string const* metadata) const override {
return compress(data, metadata);
}
// Reports the compression type tag of this compressor (FLAC).
compression_type type() const override {
  return compression_type::FLAC;
}
@ -344,15 +352,20 @@ class flac_block_compressor final : public block_compressor::impl {
exhaustive_ ? ", exhaustive" : "");
}
bool check_metadata(folly::dynamic meta) const override {
if (meta.empty()) {
return false;
}
return meta.count("endianness") > 0 && meta.count("signedness") > 0 &&
meta.count("padding") > 0 && meta.count("bytes_per_sample") > 0 &&
meta.count("bits_per_sample") > 0 &&
meta.count("number_of_channels") > 0;
std::string metadata_requirements() const override {
folly::dynamic req = folly::dynamic::object
// clang-format off
("endianness", folly::dynamic::array("set",
folly::dynamic::array("big", "little")))
("signedness", folly::dynamic::array("set",
folly::dynamic::array("signed", "unsigned")))
("padding", folly::dynamic::array("set",
folly::dynamic::array("msb", "lsb")))
("bytes_per_sample", folly::dynamic::array("range", 1, 4))
("bits_per_sample", folly::dynamic::array("range", 8, 32))
("number_of_channels", folly::dynamic::array("range", 1, 8))
; // clang-format on
return folly::toJson(req);
}
private:

View File

@ -66,8 +66,9 @@ class lz4_block_compressor final : public block_compressor::impl {
return std::make_unique<lz4_block_compressor>(*this);
}
std::vector<uint8_t> compress(const std::vector<uint8_t>& data,
folly::dynamic /*meta*/) const override {
std::vector<uint8_t>
compress(const std::vector<uint8_t>& data,
std::string const* /*metadata*/) const override {
std::vector<uint8_t> compressed(
sizeof(uint32_t) + LZ4_compressBound(folly::to<int>(data.size())));
*reinterpret_cast<uint32_t*>(&compressed[0]) = data.size();
@ -84,16 +85,16 @@ class lz4_block_compressor final : public block_compressor::impl {
return compressed;
}
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const override {
return compress(data, std::move(meta));
std::vector<uint8_t> compress(std::vector<uint8_t>&& data,
std::string const* metadata) const override {
return compress(data, metadata);
}
// Reports the compression type tag of this compressor (LZ4).
compression_type type() const override {
  return compression_type::LZ4;
}
// Produces the description string by delegating to Policy::describe() with
// the stored level_.
std::string describe() const override {
  return Policy::describe(level_);
}
bool check_metadata(folly::dynamic /*meta*/) const override { return true; }
// This compressor states no metadata requirements: the returned string is
// empty.
std::string metadata_requirements() const override {
  return {};
}
private:
const int level_;

View File

@ -64,17 +64,17 @@ class lzma_block_compressor final : public block_compressor::impl {
}
std::vector<uint8_t> compress(const std::vector<uint8_t>& data,
folly::dynamic meta) const override;
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const override {
return compress(data, std::move(meta));
std::string const* metadata) const override;
std::vector<uint8_t> compress(std::vector<uint8_t>&& data,
std::string const* metadata) const override {
return compress(data, metadata);
}
// Reports the compression type tag of this compressor (LZMA).
compression_type type() const override {
  return compression_type::LZMA;
}
// Returns a copy of the stored description_ string.
std::string describe() const override {
  return description_;
}
bool check_metadata(folly::dynamic /*meta*/) const override { return true; }
// This compressor states no metadata requirements: the returned string is
// empty.
std::string metadata_requirements() const override {
  return {};
}
private:
std::vector<uint8_t>
@ -178,7 +178,7 @@ lzma_block_compressor::compress(const std::vector<uint8_t>& data,
std::vector<uint8_t>
lzma_block_compressor::compress(const std::vector<uint8_t>& data,
folly::dynamic /*meta*/) const {
std::string const* /*metadata*/) const {
std::vector<uint8_t> best = compress(data, &filters_[1]);
if (filters_[0].id != LZMA_VLI_UNKNOWN) {

View File

@ -37,13 +37,15 @@ class null_block_compressor final : public block_compressor::impl {
return std::make_unique<null_block_compressor>(*this);
}
std::vector<uint8_t> compress(const std::vector<uint8_t>& data,
folly::dynamic /*meta*/) const override {
// "Compresses" by returning an unmodified copy of the input; the metadata
// pointer is ignored.
std::vector<uint8_t>
compress(const std::vector<uint8_t>& data,
         std::string const* /*metadata*/) const override {
  std::vector<uint8_t> copy(data);
  return copy;
}
std::vector<uint8_t> compress(std::vector<uint8_t>&& data,
folly::dynamic /*meta*/) const override {
// Rvalue overload: hands the caller's buffer back by moving it into the
// result instead of copying; the metadata pointer is ignored.
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data,
         std::string const* /*metadata*/) const override {
  std::vector<uint8_t> passthrough(std::move(data));
  return passthrough;
}
@ -51,7 +53,7 @@ class null_block_compressor final : public block_compressor::impl {
// Returns the fixed description string of this compressor.
std::string describe() const override {
  return "null";
}
bool check_metadata(folly::dynamic /*meta*/) const override { return true; }
// This compressor states no metadata requirements: the returned string is
// empty.
std::string metadata_requirements() const override {
  return {};
}
};
class null_block_decompressor final : public block_decompressor::impl {

View File

@ -55,11 +55,11 @@ class zstd_block_compressor final : public block_compressor::impl {
}
std::vector<uint8_t> compress(const std::vector<uint8_t>& data,
folly::dynamic meta) const override;
std::string const* metadata) const override;
std::vector<uint8_t>
compress(std::vector<uint8_t>&& data, folly::dynamic meta) const override {
return compress(data, std::move(meta));
// Overload for callers handing over an rvalue buffer. The buffer is not
// consumed: inside the body `data` is an lvalue, so the call below resolves
// to the const-reference overload.
//
// `metadata` is a raw, non-owning pointer and is passed through as-is;
// wrapping it in std::move() would only be a misleading spelling of a plain
// pointer copy.
std::vector<uint8_t> compress(std::vector<uint8_t>&& data,
                              std::string const* metadata) const override {
  return compress(data, metadata);
}
// Reports the compression type tag of this compressor (ZSTD).
compression_type type() const override {
  return compression_type::ZSTD;
}
@ -68,7 +68,7 @@ class zstd_block_compressor final : public block_compressor::impl {
return fmt::format("zstd [level={}]", level_);
}
bool check_metadata(folly::dynamic /*meta*/) const override { return true; }
// This compressor states no metadata requirements: the returned string is
// empty.
std::string metadata_requirements() const override {
  return {};
}
private:
class scoped_context;
@ -147,7 +147,7 @@ std::weak_ptr<zstd_block_compressor::context_manager>
std::vector<uint8_t>
zstd_block_compressor::compress(const std::vector<uint8_t>& data,
folly::dynamic /*meta*/) const {
std::string const* /*metadata*/) const {
std::vector<uint8_t> compressed(ZSTD_compressBound(data.size()));
scoped_context ctx(*ctxmgr_);
auto size = ZSTD_compressCCtx(ctx.get(), compressed.data(), compressed.size(),

View File

@ -0,0 +1,63 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include "dwarfs/compression_metadata_requirements.h"
namespace dwarfs::detail {
/**
 * Validate the general shape of a single metadata requirement.
 *
 * The requirement value must be a non-empty array whose first element, read
 * as a string, equals `expected_type` and whose total number of elements
 * (that first element included) equals `expected_size`. The checks run in
 * this order and the first failing one is reported.
 *
 * \param dyn            requirement value to validate
 * \param expected_type  type tag the first array element must match
 * \param expected_size  required number of array elements
 * \param name           requirement name, only used in error messages
 *
 * \throws std::runtime_error if any of the checks fails
 */
void check_dynamic_common(folly::dynamic const& dyn,
                          std::string_view expected_type, size_t expected_size,
                          std::string_view name) {
  if (!dyn.isArray()) {
    throw std::runtime_error(
        fmt::format("found non-array type for requirement '{}'", name));
  }

  if (dyn.empty()) {
    throw std::runtime_error(
        fmt::format("unexpected empty value for requirement '{}'", name));
  }

  // The leading element carries the requirement's type tag.
  auto const actual_type = dyn[0].asString();
  if (actual_type != expected_type) {
    throw std::runtime_error(
        fmt::format("invalid type '{}' for requirement '{}', expected '{}'",
                    actual_type, name, expected_type));
  }

  auto const actual_size = dyn.size();
  if (actual_size != expected_size) {
    throw std::runtime_error(
        fmt::format("unexpected size '{}' for requirement '{}', expected {}",
                    actual_size, name, expected_size));
  }
}
/**
 * Reject any metadata requirements that are still present in `req`.
 *
 * Returns silently if `req` is empty. Otherwise the keys of `req` are
 * collected, sorted, and reported in a single exception message.
 * NOTE(review): presumably callers remove every requirement they understand
 * from `req` before calling this, so whatever is left is unsupported --
 * confirm against the callers.
 *
 * \param req  requirements object to inspect; it is not modified here
 *
 * \throws std::runtime_error listing the sorted keys, comma-separated
 */
void check_unsupported_metadata_requirements(folly::dynamic& req) {
  if (req.empty()) {
    return;
  }

  std::vector<std::string> keys;
  keys.reserve(req.size());

  // Bind each key by const reference; taking it by value would copy a
  // folly::dynamic per iteration just to read its string form.
  for (auto const& k : req.keys()) {
    keys.emplace_back(k.asString());
  }

  std::sort(keys.begin(), keys.end());

  throw std::runtime_error(fmt::format(
      "unsupported metadata requirements: {}", folly::join(", ", keys)));
}
} // namespace dwarfs::detail

View File

@ -102,8 +102,7 @@ class raw_fsblock : public fsblock::impl {
wg.add_job([this, prom = std::move(prom)]() mutable {
try {
// TODO: metadata
auto tmp = std::make_shared<block_data>(
bc_.compress(data_->vec(), folly::dynamic()));
auto tmp = std::make_shared<block_data>(bc_.compress(data_->vec()));
{
std::lock_guard lock(mx_);

View File

@ -1025,11 +1025,13 @@ int mkdwarfs_main(int argc, sys_char** argv) {
compression_opt.visit_contextual([catmgr = options.inode.categorizer_mgr](
auto cat, block_compressor const& bc) {
if (!bc.check_metadata(catmgr->category_metadata_sample(cat))) {
try {
catmgr->set_metadata_requirements(cat, bc.metadata_requirements());
} catch (std::exception const& e) {
throw std::runtime_error(
fmt::format("compression '{}' cannot be used for category '{}': "
"insufficient metadata",
bc.describe(), catmgr->category_name(cat)));
"metadata requirements not met ({})",
bc.describe(), catmgr->category_name(cat), e.what()));
}
});
} catch (std::exception const& e) {

View File

@ -24,6 +24,8 @@
#include <gtest/gtest.h>
#include <folly/json.h>
#include "dwarfs/block_compressor.h"
#include "dwarfs/pcm_sample_transformer.h"
@ -148,7 +150,7 @@ TEST(flac_compressor, basic) {
block_compressor comp("flac");
auto compressed = comp.compress(data, std::move(meta));
auto compressed = comp.compress(data, folly::toJson(meta));
EXPECT_LT(compressed.size(), data.size() / 2);
@ -181,7 +183,7 @@ TEST_P(flac_param, combinations) {
block_compressor comp("flac");
auto compressed = comp.compress(data, std::move(meta));
auto compressed = comp.compress(data, folly::toJson(meta));
EXPECT_LT(compressed.size(), data.size() / 2);