From 3f0d7c14fd4a4fa3fe13f71b53e60ce5ab3e7441 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sat, 22 Jul 2023 17:40:22 +0200 Subject: [PATCH] Category-dependent block compression --- CMakeLists.txt | 1 + include/dwarfs/block_compressor.h | 3 + include/dwarfs/block_compressor_parser.h | 37 ++++++ include/dwarfs/contextual_option.h | 38 +++--- include/dwarfs/fragment_order_parser.h | 1 + include/dwarfs/options.h | 6 +- src/dwarfs/block_compressor_parser.cpp | 35 ++++++ src/dwarfs/category_parser.cpp | 2 +- src/dwarfs/compression/brotli.cpp | 4 + src/dwarfs/compression/lz4.cpp | 8 ++ src/dwarfs/compression/lzma.cpp | 9 +- src/dwarfs/compression/null.cpp | 2 + src/dwarfs/compression/zstd.cpp | 4 + src/mkdwarfs_main.cpp | 147 +++++++++++++---------- 14 files changed, 214 insertions(+), 83 deletions(-) create mode 100644 include/dwarfs/block_compressor_parser.h create mode 100644 src/dwarfs/block_compressor_parser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 64e343f3..37de4867 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -354,6 +354,7 @@ list( LIBDWARFS_SRC src/dwarfs/block_cache.cpp src/dwarfs/block_compressor.cpp + src/dwarfs/block_compressor_parser.cpp src/dwarfs/block_manager.cpp src/dwarfs/block_range.cpp src/dwarfs/builtin_script.cpp diff --git a/include/dwarfs/block_compressor.h b/include/dwarfs/block_compressor.h index 8184cada..07329a05 100644 --- a/include/dwarfs/block_compressor.h +++ b/include/dwarfs/block_compressor.h @@ -65,6 +65,8 @@ class block_compressor { compression_type type() const { return impl_->type(); } + std::string describe() const { return impl_->describe(); } + class impl { public: virtual ~impl() = default; @@ -77,6 +79,7 @@ class block_compressor { compress(std::vector&& data) const = 0; virtual compression_type type() const = 0; + virtual std::string describe() const = 0; }; private: diff --git a/include/dwarfs/block_compressor_parser.h b/include/dwarfs/block_compressor_parser.h new file mode 100644 index 
00000000..14d3df84 --- /dev/null +++ b/include/dwarfs/block_compressor_parser.h @@ -0,0 +1,37 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see <https://www.gnu.org/licenses/>. + */ + +#pragma once + +#include +#include + +#include "dwarfs/block_compressor.h" + +namespace dwarfs { + +struct block_compressor_parser { + public: + block_compressor parse(std::string_view arg) const; + std::string to_string(block_compressor const& opts) const; +}; + +} // namespace dwarfs diff --git a/include/dwarfs/contextual_option.h b/include/dwarfs/contextual_option.h index c273a1dc..b3b50fd8 100644 --- a/include/dwarfs/contextual_option.h +++ b/include/dwarfs/contextual_option.h @@ -41,22 +41,22 @@ class contextual_option { using policy_type = Policy; using context_argument_type = typename policy_type::ContextArgumentType; using context_type = typename policy_type::ContextType; - using option_type = typename policy_type::OptionType; + using value_type = typename policy_type::ValueType; template friend class contextual_option_parser; contextual_option() = default; - explicit contextual_option(option_type const& def) + explicit contextual_option(value_type const& def) : default_{def} {} - void set_default(option_type const& val) { default_ = val; } + void set_default(value_type const&
val) { default_ = val; } - void add_contextual(context_type const& ctx, option_type const& val) { - contextual_[ctx] = val; + bool add_contextual(context_type const& ctx, value_type const& val) { + return contextual_.emplace(ctx, val).second; } - std::optional + std::optional get_optional(context_argument_type const& arg) const { if constexpr (std::is_same_v) { return get_optional_impl(arg); @@ -65,7 +65,7 @@ class contextual_option { } } - option_type get(context_argument_type const& arg) const { + value_type get(context_argument_type const& arg) const { if constexpr (std::is_same_v) { return get_impl(arg); } else { @@ -73,9 +73,9 @@ class contextual_option { } } - std::optional get_optional() const { return default_; } + std::optional get_optional() const { return default_; } - option_type get() const { return default_.value(); } + value_type get() const { return default_.value(); } template bool any_is(T&& pred) const { @@ -88,22 +88,22 @@ class contextual_option { } private: - std::optional get_optional_impl(context_type const& ctx) const { + std::optional get_optional_impl(context_type const& ctx) const { if (auto it = contextual_.find(ctx); it != contextual_.end()) { return it->second; } return default_; } - option_type get_impl(context_type const& ctx) const { + value_type get_impl(context_type const& ctx) const { if (auto it = contextual_.find(ctx); it != contextual_.end()) { return it->second; } return default_.value(); } - std::optional default_; - std::unordered_map contextual_; + std::optional default_; + std::unordered_map contextual_; }; template @@ -132,10 +132,10 @@ class contextual_option_parser { std::invoke_result_t, typename option_type::context_type>) { - opt_.add_contextual(cp_.parse(ctx), val); + add_contextual(cp_.parse(ctx), val); } else { for (auto c : cp_.parse(ctx)) { - opt_.add_contextual(c, val); + add_contextual(c, val); } } } @@ -172,6 +172,14 @@ class contextual_option_parser { } private: + void add_contextual(typename 
option_type::context_type const& ctx, + typename option_type::value_type const& val) const { + if (!opt_.add_contextual(ctx, val)) { + throw std::runtime_error(fmt::format( + "duplicate context '{}' for option '{}'", cp_.to_string(ctx), name_)); + } + } + OptionType& opt_; ContextParser const& cp_; OptionParser const& op_; diff --git a/include/dwarfs/fragment_order_parser.h b/include/dwarfs/fragment_order_parser.h index dd9cd025..4c9f3e18 100644 --- a/include/dwarfs/fragment_order_parser.h +++ b/include/dwarfs/fragment_order_parser.h @@ -21,6 +21,7 @@ #pragma once +#include #include #include "dwarfs/options.h" diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index c6de7fb9..0ac40159 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -44,7 +44,7 @@ template struct categorized_option_policy { using ContextArgumentType = fragment_category; using ContextType = fragment_category::value_type; - using OptionType = T; + using ValueType = T; static ContextType context_from_arg(ContextArgumentType const& arg) { return arg.value(); @@ -53,9 +53,9 @@ struct categorized_option_policy { } // namespace detail -template +template using categorized_option = - contextual_option>; + contextual_option>; enum class mlock_mode { NONE, TRY, MUST }; diff --git a/src/dwarfs/block_compressor_parser.cpp b/src/dwarfs/block_compressor_parser.cpp new file mode 100644 index 00000000..4c6610a1 --- /dev/null +++ b/src/dwarfs/block_compressor_parser.cpp @@ -0,0 +1,35 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see <https://www.gnu.org/licenses/>. + */ + +#include "dwarfs/block_compressor_parser.h" + +namespace dwarfs { + +block_compressor block_compressor_parser::parse(std::string_view arg) const { + return block_compressor(std::string(arg)); +} + +std::string +block_compressor_parser::to_string(block_compressor const& bc) const { + return bc.describe(); +} + +} // namespace dwarfs diff --git a/src/dwarfs/category_parser.cpp b/src/dwarfs/category_parser.cpp index fd02ea93..060fa020 100644 --- a/src/dwarfs/category_parser.cpp +++ b/src/dwarfs/category_parser.cpp @@ -33,7 +33,7 @@ std::vector category_parser::parse(std::string_view arg) const { if (!catmgr_) { throw std::runtime_error( - "cannot configure category-specific options without any categories"); + "cannot configure category-specific options without any categorizers"); } std::vector rv; diff --git a/src/dwarfs/compression/brotli.cpp b/src/dwarfs/compression/brotli.cpp index d3426b5b..f096ec89 100644 --- a/src/dwarfs/compression/brotli.cpp +++ b/src/dwarfs/compression/brotli.cpp @@ -75,6 +75,10 @@ class brotli_block_compressor final : public block_compressor::impl { compression_type type() const override { return compression_type::BROTLI; } + std::string describe() const override { + return fmt::format("brotli [quality={}, lgwin={}]", quality_, window_bits_); + } + private: uint32_t const quality_; uint32_t const window_bits_; diff --git a/src/dwarfs/compression/lz4.cpp b/src/dwarfs/compression/lz4.cpp index fde9b23d..e40e12b0 100644 --- a/src/dwarfs/compression/lz4.cpp +++ b/src/dwarfs/compression/lz4.cpp @@ -38,6 +38,8 @@ struct lz4_compression_policy { reinterpret_cast(src),
reinterpret_cast(dest), folly::to(size), folly::to(destsize))); } + + static std::string describe(int /*level*/) { return "lz4"; } }; struct lz4hc_compression_policy { @@ -47,6 +49,10 @@ struct lz4hc_compression_policy { reinterpret_cast(src), reinterpret_cast(dest), folly::to(size), folly::to(destsize), level)); } + + static std::string describe(int level) { + return fmt::format("lz4hc [level={}]", level); + } }; template @@ -84,6 +90,8 @@ class lz4_block_compressor final : public block_compressor::impl { compression_type type() const override { return compression_type::LZ4; } + std::string describe() const override { return Policy::describe(level_); } + private: const int level_; }; diff --git a/src/dwarfs/compression/lzma.cpp b/src/dwarfs/compression/lzma.cpp index 689ebcfd..6941fc6b 100644 --- a/src/dwarfs/compression/lzma.cpp +++ b/src/dwarfs/compression/lzma.cpp @@ -71,6 +71,8 @@ class lzma_block_compressor final : public block_compressor::impl { compression_type type() const override { return compression_type::LZMA; } + std::string describe() const override { return description_; } + private: std::vector compress(const std::vector& data, const lzma_filter* filters) const; @@ -107,11 +109,16 @@ class lzma_block_compressor final : public block_compressor::impl { lzma_options_lzma opt_lzma_; std::array filters_; + std::string description_; }; lzma_block_compressor::lzma_block_compressor(unsigned level, bool extreme, const std::string& binary_mode, - unsigned dict_size) { + unsigned dict_size) + : description_{ + fmt::format("lzma [level={}, dict_size={}{}{}]", level, dict_size, + extreme ? ", extreme" : "", + binary_mode.empty() ? 
"" : ", binary=" + binary_mode)} { if (lzma_lzma_preset(&opt_lzma_, get_preset(level, extreme))) { DWARFS_THROW(runtime_error, "unsupported preset, possibly a bug"); } diff --git a/src/dwarfs/compression/null.cpp b/src/dwarfs/compression/null.cpp index 12bf0df5..bb5418f7 100644 --- a/src/dwarfs/compression/null.cpp +++ b/src/dwarfs/compression/null.cpp @@ -47,6 +47,8 @@ class null_block_compressor final : public block_compressor::impl { } compression_type type() const override { return compression_type::NONE; } + + std::string describe() const override { return "null"; } }; class null_block_decompressor final : public block_decompressor::impl { diff --git a/src/dwarfs/compression/zstd.cpp b/src/dwarfs/compression/zstd.cpp index 99a135eb..aff83759 100644 --- a/src/dwarfs/compression/zstd.cpp +++ b/src/dwarfs/compression/zstd.cpp @@ -63,6 +63,10 @@ class zstd_block_compressor final : public block_compressor::impl { compression_type type() const override { return compression_type::ZSTD; } + std::string describe() const override { + return fmt::format("zstd [level={}]", level_); + } + private: class scoped_context; diff --git a/src/mkdwarfs_main.cpp b/src/mkdwarfs_main.cpp index bc88109e..9d6b1c1a 100644 --- a/src/mkdwarfs_main.cpp +++ b/src/mkdwarfs_main.cpp @@ -51,6 +51,7 @@ #include #include "dwarfs/block_compressor.h" +#include "dwarfs/block_compressor_parser.h" #include "dwarfs/block_manager.h" #include "dwarfs/builtin_script.h" #include "dwarfs/categorizer.h" @@ -275,13 +276,14 @@ int mkdwarfs_main(int argc, sys_char** argv) { block_manager::config cfg; sys_string path_str, output_str; - std::string memory_limit, script_arg, compression, header, schema_compression, + std::string memory_limit, script_arg, header, schema_compression, metadata_compression, log_level_str, timestamp, time_resolution, progress_mode, recompress_opts, pack_metadata, file_hash_algo, debug_filter, max_similarity_size, input_list_str, chmod_str, categorizer_list_str; std::vector filter; - 
std::vector order, max_lookback_blocks, window_size, window_step; + std::vector order, max_lookback_blocks, window_size, window_step, + compression; size_t num_workers, num_scanner_workers; bool no_progress = false, remove_header = false, no_section_index = false, force_overwrite = false; @@ -292,11 +294,12 @@ int mkdwarfs_main(int argc, sys_char** argv) { integral_value_parser max_lookback_parser; integral_value_parser window_size_parser(6, 24); integral_value_parser window_step_parser(0, 8); + fragment_order_parser order_parser; + block_compressor_parser compressor_parser; scanner_options options; - auto order_desc = - "inode fragments order (" + fragment_order_parser::choices() + ")"; + auto order_desc = "inode fragments order (" + order_parser.choices() + ")"; auto progress_desc = "progress mode (" + (from(progress_modes) | get<0>() | unsplit(", ")) + ")"; @@ -430,7 +433,8 @@ int mkdwarfs_main(int argc, sys_char** argv) { po::options_description compressor_opts("Compressor options"); compressor_opts.add_options() ("compression,C", - po::value(&compression), + // po::value(&compression), // TODO + po::value>(&compression)->multitoken(), "block compression algorithm") ("schema-compression", po::value(&schema_compression), @@ -590,7 +594,7 @@ int mkdwarfs_main(int argc, sys_char** argv) { } if (!vm.count("compression")) { - compression = defaults.data_compression; + compression.push_back(std::string(defaults.data_compression)); } if (!vm.count("schema-compression")) { @@ -913,13 +917,14 @@ int mkdwarfs_main(int argc, sys_char** argv) { progress prog(std::move(updater), interval_ms); - block_compressor bc(compression); + block_compressor bc(compression.front()); // TODO block_compressor schema_bc(schema_compression); block_compressor metadata_bc(metadata_compression); auto min_memory_req = num_workers * (UINT64_C(1) << cfg.block_size_bits); - if (mem_limit < min_memory_req && compression != "null") { + // TODO: + if (mem_limit < min_memory_req /* && compression != 
"null" */) { LOG_WARN << "low memory limit (" << size_with_unit(mem_limit) << "), need " << size_with_unit(min_memory_req) << " to efficiently compress " << size_with_unit(UINT64_C(1) << cfg.block_size_bits) @@ -951,6 +956,77 @@ int mkdwarfs_main(int argc, sys_char** argv) { os = std::make_unique(); } + // TODO: the whole re-writing thing will be a bit weird in combination + // with categories; we'd likely require a "category"-section to be + // present (which we'll also require for bit-identical mode) + + if (!categorizer_list_str.empty()) { + std::vector categorizer_list; + boost::split(categorizer_list, categorizer_list_str, boost::is_any_of(",")); + + options.inode.categorizer_mgr = std::make_shared(lgr); + + for (auto const& name : categorizer_list) { + options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm)); + } + } + + category_parser cp(options.inode.categorizer_mgr); + + try { + contextual_option_parser cop("--order", options.inode.fragment_order, cp, + order_parser); + cop.parse(order); + cop.dump(std::cerr); + } catch (std::exception const& e) { + LOG_ERROR << e.what(); + return 1; + } + + try { + categorized_option max_lookback_opt; + contextual_option_parser cop("--max-lookback-blocks", max_lookback_opt, cp, + max_lookback_parser); + cop.parse(max_lookback_blocks); + cop.dump(std::cerr); + } catch (std::exception const& e) { + LOG_ERROR << e.what(); + return 1; + } + + try { + categorized_option window_size_opt; + contextual_option_parser cop("--window-size", window_size_opt, cp, + window_size_parser); + cop.parse(window_size); + cop.dump(std::cerr); + } catch (std::exception const& e) { + LOG_ERROR << e.what(); + return 1; + } + + try { + categorized_option window_step_opt; + contextual_option_parser cop("--window-step", window_step_opt, cp, + window_step_parser); + cop.parse(window_step); + cop.dump(std::cerr); + } catch (std::exception const& e) { + LOG_ERROR << e.what(); + return 1; + } + + try { + categorized_option compression_opt; + 
contextual_option_parser cop("--compression", compression_opt, cp, + compressor_parser); + cop.parse(compression); + cop.dump(std::cerr); + } catch (std::exception const& e) { + LOG_ERROR << e.what(); + return 1; + } + filesystem_writer fsw(*os, lgr, wg_compress, prog, bc, schema_bc, metadata_bc, fswopts, header_ifs.get()); @@ -962,61 +1038,6 @@ int mkdwarfs_main(int argc, sys_char** argv) { fsw, rw_opts); wg_compress.wait(); } else { - if (!categorizer_list_str.empty()) { - std::vector categorizer_list; - boost::split(categorizer_list, categorizer_list_str, - boost::is_any_of(",")); - - options.inode.categorizer_mgr = - std::make_shared(lgr); - - for (auto const& name : categorizer_list) { - options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm)); - } - } - - category_parser cp(options.inode.categorizer_mgr); - - try { - fragment_order_parser fop; - contextual_option_parser("--order", options.inode.fragment_order, cp, - fop) - .parse(order); - } catch (std::exception const& e) { - LOG_ERROR << e.what(); - return 1; - } - - try { - categorized_option max_lookback_opt; - contextual_option_parser("--max-lookback-blocks", max_lookback_opt, cp, - max_lookback_parser) - .parse(max_lookback_blocks); - } catch (std::exception const& e) { - LOG_ERROR << e.what(); - return 1; - } - - try { - categorized_option window_size_opt; - contextual_option_parser("--window-size", window_size_opt, cp, - window_size_parser) - .parse(window_size); - } catch (std::exception const& e) { - LOG_ERROR << e.what(); - return 1; - } - - try { - categorized_option window_step_opt; - contextual_option_parser("--window-step", window_step_opt, cp, - window_step_parser) - .parse(window_step); - } catch (std::exception const& e) { - LOG_ERROR << e.what(); - return 1; - } - scanner s(lgr, wg_scanner, cfg, entry_factory::create(), std::make_shared(), std::move(script), options);