Category-dependent block compression

This commit is contained in:
Marcus Holland-Moritz 2023-07-22 17:40:22 +02:00
parent 92226a73bd
commit 3f0d7c14fd
14 changed files with 214 additions and 83 deletions

View File

@ -354,6 +354,7 @@ list(
LIBDWARFS_SRC
src/dwarfs/block_cache.cpp
src/dwarfs/block_compressor.cpp
src/dwarfs/block_compressor_parser.cpp
src/dwarfs/block_manager.cpp
src/dwarfs/block_range.cpp
src/dwarfs/builtin_script.cpp

View File

@ -65,6 +65,8 @@ class block_compressor {
compression_type type() const { return impl_->type(); }
// Textual description of this compressor, as reported by the
// underlying implementation object.
std::string describe() const {
  return impl_->describe();
}
class impl {
public:
virtual ~impl() = default;
@ -77,6 +79,7 @@ class block_compressor {
compress(std::vector<uint8_t>&& data) const = 0;
virtual compression_type type() const = 0;
// Returns a textual description of the compressor; block_compressor's
// own describe() forwards to this.
virtual std::string describe() const = 0;
};
private:

View File

@ -0,0 +1,37 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <string>
#include <string_view>
#include "dwarfs/block_compressor.h"
namespace dwarfs {
/**
 * Value parser for block compressor specifications.
 *
 * Offers a parse()/to_string() pair that converts between a textual
 * compressor specification and a block_compressor instance.
 */
struct block_compressor_parser {
  /// Create a block_compressor from the textual specification `arg`.
  block_compressor parse(std::string_view arg) const;

  /// Return the description reported by the given compressor.
  std::string to_string(block_compressor const& opts) const;
};
} // namespace dwarfs

View File

@ -41,22 +41,22 @@ class contextual_option {
using policy_type = Policy;
using context_argument_type = typename policy_type::ContextArgumentType;
using context_type = typename policy_type::ContextType;
using option_type = typename policy_type::OptionType;
using value_type = typename policy_type::ValueType;
template <typename OptionType, typename ContextParser, typename OptionParser>
friend class contextual_option_parser;
contextual_option() = default;
explicit contextual_option(option_type const& def)
explicit contextual_option(value_type const& def)
: default_{def} {}
void set_default(option_type const& val) { default_ = val; }
void set_default(value_type const& val) { default_ = val; }
void add_contextual(context_type const& ctx, option_type const& val) {
contextual_[ctx] = val;
bool add_contextual(context_type const& ctx, value_type const& val) {
return contextual_.emplace(ctx, val).second;
}
std::optional<option_type>
std::optional<value_type>
get_optional(context_argument_type const& arg) const {
if constexpr (std::is_same_v<context_type, context_argument_type>) {
return get_optional_impl(arg);
@ -65,7 +65,7 @@ class contextual_option {
}
}
option_type get(context_argument_type const& arg) const {
value_type get(context_argument_type const& arg) const {
if constexpr (std::is_same_v<context_type, context_argument_type>) {
return get_impl(arg);
} else {
@ -73,9 +73,9 @@ class contextual_option {
}
}
std::optional<option_type> get_optional() const { return default_; }
std::optional<value_type> get_optional() const { return default_; }
option_type get() const { return default_.value(); }
value_type get() const { return default_.value(); }
template <typename T>
bool any_is(T&& pred) const {
@ -88,22 +88,22 @@ class contextual_option {
}
private:
std::optional<option_type> get_optional_impl(context_type const& ctx) const {
std::optional<value_type> get_optional_impl(context_type const& ctx) const {
if (auto it = contextual_.find(ctx); it != contextual_.end()) {
return it->second;
}
return default_;
}
option_type get_impl(context_type const& ctx) const {
value_type get_impl(context_type const& ctx) const {
if (auto it = contextual_.find(ctx); it != contextual_.end()) {
return it->second;
}
return default_.value();
}
std::optional<option_type> default_;
std::unordered_map<context_type, option_type> contextual_;
std::optional<value_type> default_;
std::unordered_map<context_type, value_type> contextual_;
};
template <typename OptionType, typename ContextParser, typename OptionParser>
@ -132,10 +132,10 @@ class contextual_option_parser {
std::invoke_result_t<decltype(&ContextParser::parse),
ContextParser, decltype(ctx)>,
typename option_type::context_type>) {
opt_.add_contextual(cp_.parse(ctx), val);
add_contextual(cp_.parse(ctx), val);
} else {
for (auto c : cp_.parse(ctx)) {
opt_.add_contextual(c, val);
add_contextual(c, val);
}
}
}
@ -172,6 +172,14 @@ class contextual_option_parser {
}
private:
// Register `val` for context `ctx` on the wrapped option.
//
// The wrapped option reports whether the context was newly added; if it
// was not, a std::runtime_error naming the context and the option is
// thrown.
void add_contextual(typename option_type::context_type const& ctx,
                    typename option_type::value_type const& val) const {
  bool const inserted = opt_.add_contextual(ctx, val);
  if (inserted) {
    return;
  }
  auto message = fmt::format("duplicate context '{}' for option '{}'",
                             cp_.to_string(ctx), name_);
  throw std::runtime_error(message);
}
OptionType& opt_;
ContextParser const& cp_;
OptionParser const& op_;

View File

@ -21,6 +21,7 @@
#pragma once
#include <string>
#include <string_view>
#include "dwarfs/options.h"

View File

@ -44,7 +44,7 @@ template <typename T>
struct categorized_option_policy {
using ContextArgumentType = fragment_category;
using ContextType = fragment_category::value_type;
using OptionType = T;
using ValueType = T;
static ContextType context_from_arg(ContextArgumentType const& arg) {
return arg.value();
@ -53,9 +53,9 @@ struct categorized_option_policy {
} // namespace detail
template <typename OptionType>
template <typename ValueType>
using categorized_option =
contextual_option<detail::categorized_option_policy<OptionType>>;
contextual_option<detail::categorized_option_policy<ValueType>>;
enum class mlock_mode { NONE, TRY, MUST };

View File

@ -0,0 +1,35 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include "dwarfs/block_compressor_parser.h"
namespace dwarfs {
/**
 * Build a block compressor from a textual specification.
 *
 * The view is first copied into an owning std::string, which is then
 * passed to the block_compressor constructor.
 */
block_compressor block_compressor_parser::parse(std::string_view arg) const {
  auto const spec = std::string(arg);
  return block_compressor(spec);
}
/**
 * Render a block compressor as text by returning its describe() result.
 */
std::string
block_compressor_parser::to_string(block_compressor const& bc) const {
  auto description = bc.describe();
  return description;
}
} // namespace dwarfs

View File

@ -33,7 +33,7 @@ std::vector<fragment_category::value_type>
category_parser::parse(std::string_view arg) const {
if (!catmgr_) {
throw std::runtime_error(
"cannot configure category-specific options without any categories");
"cannot configure category-specific options without any categorizers");
}
std::vector<fragment_category::value_type> rv;

View File

@ -75,6 +75,10 @@ class brotli_block_compressor final : public block_compressor::impl {
compression_type type() const override { return compression_type::BROTLI; }
// Describe this compressor as "brotli" followed by its quality and
// window-bits (lgwin) settings.
std::string describe() const override {
  auto summary =
      fmt::format("brotli [quality={}, lgwin={}]", quality_, window_bits_);
  return summary;
}
private:
uint32_t const quality_;
uint32_t const window_bits_;

View File

@ -38,6 +38,8 @@ struct lz4_compression_policy {
reinterpret_cast<const char*>(src), reinterpret_cast<char*>(dest),
folly::to<int>(size), folly::to<int>(destsize)));
}
// Description for the plain LZ4 policy; the level argument is accepted
// for interface parity with the other policy but is not used.
static std::string describe(int /*level*/) {
  return std::string("lz4");
}
};
struct lz4hc_compression_policy {
@ -47,6 +49,10 @@ struct lz4hc_compression_policy {
reinterpret_cast<const char*>(src), reinterpret_cast<char*>(dest),
folly::to<int>(size), folly::to<int>(destsize), level));
}
// Description for the LZ4HC policy, including the compression level.
static std::string describe(int level) {
  auto summary = fmt::format("lz4hc [level={}]", level);
  return summary;
}
};
template <typename Policy>
@ -84,6 +90,8 @@ class lz4_block_compressor final : public block_compressor::impl {
compression_type type() const override { return compression_type::LZ4; }
// Delegate to the policy so each LZ4 flavour formats its own
// description from the configured level.
std::string describe() const override {
  return Policy::describe(level_);
}
private:
const int level_;
};

View File

@ -71,6 +71,8 @@ class lzma_block_compressor final : public block_compressor::impl {
compression_type type() const override { return compression_type::LZMA; }
// Return the stored description string (a copy of description_).
std::string describe() const override {
  return description_;
}
private:
std::vector<uint8_t>
compress(const std::vector<uint8_t>& data, const lzma_filter* filters) const;
@ -107,11 +109,16 @@ class lzma_block_compressor final : public block_compressor::impl {
lzma_options_lzma opt_lzma_;
std::array<lzma_filter, 3> filters_;
std::string description_;
};
lzma_block_compressor::lzma_block_compressor(unsigned level, bool extreme,
const std::string& binary_mode,
unsigned dict_size) {
unsigned dict_size)
: description_{
fmt::format("lzma [level={}, dict_size={}{}{}]", level, dict_size,
extreme ? ", extreme" : "",
binary_mode.empty() ? "" : ", binary=" + binary_mode)} {
if (lzma_lzma_preset(&opt_lzma_, get_preset(level, extreme))) {
DWARFS_THROW(runtime_error, "unsupported preset, possibly a bug");
}

View File

@ -47,6 +47,8 @@ class null_block_compressor final : public block_compressor::impl {
}
compression_type type() const override { return compression_type::NONE; }
// The null compressor is described by a fixed name only.
std::string describe() const override {
  return std::string("null");
}
};
class null_block_decompressor final : public block_decompressor::impl {

View File

@ -63,6 +63,10 @@ class zstd_block_compressor final : public block_compressor::impl {
compression_type type() const override { return compression_type::ZSTD; }
// Describe this compressor as "zstd" followed by its compression level.
std::string describe() const override {
  auto summary = fmt::format("zstd [level={}]", level_);
  return summary;
}
private:
class scoped_context;

View File

@ -51,6 +51,7 @@
#include <fmt/format.h>
#include "dwarfs/block_compressor.h"
#include "dwarfs/block_compressor_parser.h"
#include "dwarfs/block_manager.h"
#include "dwarfs/builtin_script.h"
#include "dwarfs/categorizer.h"
@ -275,13 +276,14 @@ int mkdwarfs_main(int argc, sys_char** argv) {
block_manager::config cfg;
sys_string path_str, output_str;
std::string memory_limit, script_arg, compression, header, schema_compression,
std::string memory_limit, script_arg, header, schema_compression,
metadata_compression, log_level_str, timestamp, time_resolution,
progress_mode, recompress_opts, pack_metadata, file_hash_algo,
debug_filter, max_similarity_size, input_list_str, chmod_str,
categorizer_list_str;
std::vector<sys_string> filter;
std::vector<std::string> order, max_lookback_blocks, window_size, window_step;
std::vector<std::string> order, max_lookback_blocks, window_size, window_step,
compression;
size_t num_workers, num_scanner_workers;
bool no_progress = false, remove_header = false, no_section_index = false,
force_overwrite = false;
@ -292,11 +294,12 @@ int mkdwarfs_main(int argc, sys_char** argv) {
integral_value_parser<size_t> max_lookback_parser;
integral_value_parser<unsigned> window_size_parser(6, 24);
integral_value_parser<unsigned> window_step_parser(0, 8);
fragment_order_parser order_parser;
block_compressor_parser compressor_parser;
scanner_options options;
auto order_desc =
"inode fragments order (" + fragment_order_parser::choices() + ")";
auto order_desc = "inode fragments order (" + order_parser.choices() + ")";
auto progress_desc = "progress mode (" +
(from(progress_modes) | get<0>() | unsplit(", ")) + ")";
@ -430,7 +433,8 @@ int mkdwarfs_main(int argc, sys_char** argv) {
po::options_description compressor_opts("Compressor options");
compressor_opts.add_options()
("compression,C",
po::value<std::string>(&compression),
// po::value<std::string>(&compression), // TODO
po::value<std::vector<std::string>>(&compression)->multitoken(),
"block compression algorithm")
("schema-compression",
po::value<std::string>(&schema_compression),
@ -590,7 +594,7 @@ int mkdwarfs_main(int argc, sys_char** argv) {
}
if (!vm.count("compression")) {
compression = defaults.data_compression;
compression.push_back(std::string(defaults.data_compression));
}
if (!vm.count("schema-compression")) {
@ -913,13 +917,14 @@ int mkdwarfs_main(int argc, sys_char** argv) {
progress prog(std::move(updater), interval_ms);
block_compressor bc(compression);
block_compressor bc(compression.front()); // TODO
block_compressor schema_bc(schema_compression);
block_compressor metadata_bc(metadata_compression);
auto min_memory_req = num_workers * (UINT64_C(1) << cfg.block_size_bits);
if (mem_limit < min_memory_req && compression != "null") {
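// TODO: the exemption for "null" compression is disabled below now that
//       `compression` is a list of strings; re-enable it once the
//       per-category compression options are actually used here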
if (mem_limit < min_memory_req /* && compression != "null" */) {
LOG_WARN << "low memory limit (" << size_with_unit(mem_limit) << "), need "
<< size_with_unit(min_memory_req) << " to efficiently compress "
<< size_with_unit(UINT64_C(1) << cfg.block_size_bits)
@ -951,6 +956,77 @@ int mkdwarfs_main(int argc, sys_char** argv) {
os = std::make_unique<std::ostringstream>();
}
// TODO: the whole re-writing thing will be a bit weird in combination
// with categories; we'd likely require a "category"-section to be
// present (which we'll also require for bit-identical mode)
if (!categorizer_list_str.empty()) {
std::vector<std::string> categorizer_list;
boost::split(categorizer_list, categorizer_list_str, boost::is_any_of(","));
options.inode.categorizer_mgr = std::make_shared<categorizer_manager>(lgr);
for (auto const& name : categorizer_list) {
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
}
}
category_parser cp(options.inode.categorizer_mgr);
try {
contextual_option_parser cop("--order", options.inode.fragment_order, cp,
order_parser);
cop.parse(order);
cop.dump(std::cerr);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<size_t> max_lookback_opt;
contextual_option_parser cop("--max-lookback-blocks", max_lookback_opt, cp,
max_lookback_parser);
cop.parse(max_lookback_blocks);
cop.dump(std::cerr);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<unsigned> window_size_opt;
contextual_option_parser cop("--window-size", window_size_opt, cp,
window_size_parser);
cop.parse(window_size);
cop.dump(std::cerr);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<unsigned> window_step_opt;
contextual_option_parser cop("--window-step", window_step_opt, cp,
window_step_parser);
cop.parse(window_step);
cop.dump(std::cerr);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<block_compressor> compression_opt;
contextual_option_parser cop("--compression", compression_opt, cp,
compressor_parser);
cop.parse(compression);
cop.dump(std::cerr);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
filesystem_writer fsw(*os, lgr, wg_compress, prog, bc, schema_bc, metadata_bc,
fswopts, header_ifs.get());
@ -962,61 +1038,6 @@ int mkdwarfs_main(int argc, sys_char** argv) {
fsw, rw_opts);
wg_compress.wait();
} else {
if (!categorizer_list_str.empty()) {
std::vector<std::string> categorizer_list;
boost::split(categorizer_list, categorizer_list_str,
boost::is_any_of(","));
options.inode.categorizer_mgr =
std::make_shared<categorizer_manager>(lgr);
for (auto const& name : categorizer_list) {
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));
}
}
category_parser cp(options.inode.categorizer_mgr);
try {
fragment_order_parser fop;
contextual_option_parser("--order", options.inode.fragment_order, cp,
fop)
.parse(order);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<size_t> max_lookback_opt;
contextual_option_parser("--max-lookback-blocks", max_lookback_opt, cp,
max_lookback_parser)
.parse(max_lookback_blocks);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<unsigned> window_size_opt;
contextual_option_parser("--window-size", window_size_opt, cp,
window_size_parser)
.parse(window_size);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
try {
categorized_option<unsigned> window_step_opt;
contextual_option_parser("--window-step", window_step_opt, cp,
window_step_parser)
.parse(window_step);
} catch (std::exception const& e) {
LOG_ERROR << e.what();
return 1;
}
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
std::make_shared<os_access_generic>(), std::move(script),
options);