From 7b38c0744fe5eecb60242709931e0717c32691eb Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Fri, 19 Mar 2021 12:04:11 +0100 Subject: [PATCH] Flexible metadata packing --- include/dwarfs/options.h | 3 ++ src/dwarfs/metadata_v2.cpp | 88 ++++++++++++++++++++++++++------------ src/dwarfs/scanner.cpp | 42 +++++++++++------- src/mkdwarfs.cpp | 29 ++++++++++++- test/dwarfs.cpp | 21 ++++++--- thrift/metadata.thrift | 23 +++++++--- 6 files changed, 148 insertions(+), 58 deletions(-) diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index c8c6237e..eea4e4c4 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -74,6 +74,9 @@ struct scanner_options { bool with_specials{false}; uint32_t time_resolution_sec{1}; inode_options inode; + bool pack_chunk_table{false}; + bool pack_directories{false}; + bool pack_shared_files_table{false}; }; struct rewrite_options { diff --git a/src/dwarfs/metadata_v2.cpp b/src/dwarfs/metadata_v2.cpp index 4142c4a5..ba524dff 100644 --- a/src/dwarfs/metadata_v2.cpp +++ b/src/dwarfs/metadata_v2.cpp @@ -35,6 +35,8 @@ #include #include +#include + #include #include #include @@ -125,11 +127,15 @@ class metadata_ final : public metadata_v2::impl { , nlinks_(build_nlinks(options)) , chunk_table_(unpack_chunk_table()) , directories_storage_(unpack_directories()) - , directories_(meta_.dir_entries() ? directories_storage_.data() - : nullptr) + , directories_(directories_storage_.empty() ? nullptr + : directories_storage_.data()) , shared_files_(decompress_shared_files()) , unique_files_(dev_inode_offset_ - file_inode_offset_ - - shared_files_.size()) + (shared_files_.empty() + ? meta_.shared_files_table() + ? meta_.shared_files_table()->size() + : 0 + : shared_files_.size())) , options_(options) { if (static_cast(meta_.directories().size() - 1) != symlink_inode_offset_) { @@ -350,22 +356,30 @@ class metadata_ final : public metadata_v2::impl { return chunk_table_.empty() ? meta_.chunk_table()[ino] : chunk_table_[ino]; } + int file_inode_to_chunk_index(int inode) const { + inode -= file_inode_offset_; + + if (inode >= unique_files_) { + inode -= unique_files_; + + if (!shared_files_.empty()) { + if (inode < static_cast(shared_files_.size())) { + inode = shared_files_[inode] + unique_files_; + } + } else if (auto sfp = meta_.shared_files_table()) { + if (inode < static_cast(sfp->size())) { + inode = (*sfp)[inode] + unique_files_; + } + } + } + + return inode; + } + std::optional get_chunk_range(int inode) const { std::optional rv; - inode -= file_inode_offset_; - - if (!shared_files_.empty()) { - if (inode >= unique_files_) { - inode -= unique_files_; - - if (inode >= static_cast(shared_files_.size())) { - return rv; - } - - inode = shared_files_[inode] + unique_files_; - } - } + inode = file_inode_to_chunk_index(inode); if (inode >= 0 && inode < (static_cast(meta_.chunk_table().size()) - 1)) { @@ -439,7 +453,7 @@ class metadata_ final : public metadata_v2::impl { std::vector unpack_chunk_table() const { std::vector chunk_table; - if (meta_.dir_entries()) { + if (auto opts = meta_.options(); opts and opts->packed_chunk_table()) { chunk_table.resize(meta_.chunk_table().size()); std::partial_sum(meta_.chunk_table().begin(), meta_.chunk_table().end(), chunk_table.begin()); @@ -451,8 +465,8 @@ class metadata_ final : public metadata_v2::impl { std::vector unpack_directories() const { std::vector directories; - if (auto dep = meta_.dir_entries()) { - auto dirent = *dep; + if (auto opts = meta_.options(); opts and opts->packed_directories()) { + auto dirent = *meta_.dir_entries(); auto metadir = meta_.directories(); { @@ -503,8 +517,9 @@ class metadata_ final : public metadata_v2::impl { std::vector decompress_shared_files() const { std::vector decompressed; - if (auto sfp = meta_.shared_files_table()) { - if (!sfp->empty()) { + if (auto opts = meta_.options(); + opts and opts->packed_shared_files_table()) { + if (auto sfp = meta_.shared_files_table(); sfp and !sfp->empty()) { auto ti = LOG_TIMED_DEBUG; auto size = std::accumulate(sfp->begin(), sfp->end(), 2 * sfp->size()); @@ -656,6 +671,22 @@ void metadata_::dump( os << "block size: " << stbuf.f_bsize << std::endl; os << "inode count: " << stbuf.f_files << std::endl; os << "original filesystem size: " << stbuf.f_blocks << std::endl; + if (auto opt = meta_.options()) { + std::vector options; + auto boolopt = [&](auto const& name, bool value) { + if (value) { + options.push_back(name); + } + }; + boolopt("mtime_only", opt->mtime_only()); + boolopt("packed_chunk_table", opt->packed_chunk_table()); + boolopt("packed_directories", opt->packed_directories()); + boolopt("packed_shared_files_table", opt->packed_shared_files_table()); + os << "options: " << boost::join(options, "\n ") << std::endl; + if (auto res = opt->time_resolution_sec()) { + os << "time resolution: " << *res << " seconds" << std::endl; + } + } } if (detail_level > 1) { @@ -681,9 +712,13 @@ void metadata_::dump( os << "dir_entries: " << de->size() << std::endl; } if (auto sfp = meta_.shared_files_table()) { - os << "compressed shared_files_table: " << sfp->size() << std::endl; - os << "decompressed shared_files_table: " << shared_files_.size() - << std::endl; + if (meta_.options()->packed_shared_files_table()) { + os << "compressed shared_files_table: " << sfp->size() << std::endl; + os << "decompressed shared_files_table: " << shared_files_.size() + << std::endl; + } else { + os << "shared_files_table: " << sfp->size() << std::endl; + } os << "unique files: " << unique_files_ << std::endl; } @@ -870,10 +905,7 @@ void metadata_::walk_data_order_impl( for (size_t ix = 0; ix < first_chunk_block.size(); ++ix) { int ino = (*dep)[ix].inode_num(); if (ino >= file_inode_offset_ and ino < dev_inode_offset_) { - ino -= file_inode_offset_; - if (ino >= unique_files_) { - ino = shared_files_[ino - unique_files_] + unique_files_; - } + ino = file_inode_to_chunk_index(ino); if (auto beg = chunk_table_lookup(ino); beg != chunk_table_lookup(ino + 1)) { first_chunk_block[ix] = meta_.chunks()[beg].block(); diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index a40100b0..542a903c 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -322,8 +322,8 @@ class save_shared_files_visitor : public visitor_base { } } - std::vector& get_compressed_shared_files() { - if (!shared_files_.empty() && !compressed_) { + void pack_shared_files() { + if (!shared_files_.empty()) { DWARFS_CHECK(std::is_sorted(shared_files_.begin(), shared_files_.end()), "shared files vector not sorted"); std::vector compressed; @@ -350,14 +350,13 @@ class save_shared_files_visitor : public visitor_base { shared_files_.swap(compressed); } - - return shared_files_; } + std::vector& get_shared_files() { return shared_files_; } + private: uint32_t const begin_shared_; uint32_t const num_unique_; - bool compressed_{false}; std::vector shared_files_; }; @@ -705,30 +704,41 @@ void scanner_::scan(filesystem_writer& fsw, root->accept(sdv); sdv.pack(mv2, ge_data); - // pack directories - uint32_t last_first_entry = 0; - for (auto& d : mv2.directories) { - d.parent_entry = 0; // this will be recovered - auto delta = d.first_entry - last_first_entry; - last_first_entry = d.first_entry; - d.first_entry = delta; + if (options_.pack_directories) { + // pack directories + uint32_t last_first_entry = 0; + + for (auto& d : mv2.directories) { + d.parent_entry = 0; // this will be recovered + auto delta = d.first_entry - last_first_entry; + last_first_entry = d.first_entry; + d.first_entry = delta; + } } - // delta-compress chunk table - std::adjacent_difference(mv2.chunk_table.begin(), mv2.chunk_table.end(), - mv2.chunk_table.begin()); + if (options_.pack_chunk_table) { + // delta-compress chunk table + std::adjacent_difference(mv2.chunk_table.begin(), mv2.chunk_table.end(), + mv2.chunk_table.begin()); + } LOG_INFO << "saving shared files table..."; save_shared_files_visitor ssfv(first_file_inode, first_device_inode, fdv.num_unique()); root->accept(ssfv); - mv2.shared_files_table_ref() = std::move(ssfv.get_compressed_shared_files()); + if (options_.pack_shared_files_table) { + ssfv.pack_shared_files(); + } + mv2.shared_files_table_ref() = std::move(ssfv.get_shared_files()); thrift::metadata::fs_options fsopts; fsopts.mtime_only = !options_.keep_all_times; if (options_.time_resolution_sec > 1) { fsopts.time_resolution_sec_ref() = options_.time_resolution_sec; } + fsopts.packed_chunk_table = options_.pack_chunk_table; + fsopts.packed_directories = options_.pack_directories; + fsopts.packed_shared_files_table = options_.pack_shared_files_table; mv2.uids = ge_data.get_uids(); mv2.gids = ge_data.get_gids(); diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index 727dc96e..d8f3dc7f 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -331,7 +331,7 @@ int mkdwarfs(int argc, char** argv) { block_manager::config cfg; std::string path, output, memory_limit, script_arg, compression, schema_compression, metadata_compression, log_level_str, timestamp, - time_resolution, order, progress_mode, recompress_opts; + time_resolution, order, progress_mode, recompress_opts, pack_metadata; size_t num_workers, max_scanner_workers; bool no_progress = false; unsigned level; @@ -395,6 +395,9 @@ int mkdwarfs(int argc, char** argv) { ("metadata-compression", po::value(&metadata_compression), "metadata compression algorithm") + ("pack-metadata", + po::value(&pack_metadata)->default_value("all"), + "pack certain metadata elements (none, chunk_table, directories, shared_files, all)") ("recompress", po::value(&recompress_opts)->implicit_value("all"), "recompress an existing filesystem (none, block, metadata, all)") @@ -725,6 +728,30 @@ int mkdwarfs(int argc, char** argv) { return 1; } + if (!pack_metadata.empty() and pack_metadata != "none") { + if (pack_metadata == "all") { + options.pack_chunk_table = true; + options.pack_directories = true; + options.pack_shared_files_table = true; + } else { + std::vector pack_opts; + boost::split(pack_opts, pack_metadata, boost::is_any_of(",")); + for (auto const& opt : pack_opts) { + if (opt == "chunk_table") { + options.pack_chunk_table = true; + } else if (opt == "directories") { + options.pack_directories = true; + } else if (opt == "shared_files") { + options.pack_shared_files_table = true; + } else { + std::cerr << "error: the argument ('" << opt + << "') to '--pack-metadata' is invalid" << std::endl; + return 1; + } + } + } + } + unsigned interval_ms = pg_mode == console_writer::NONE || pg_mode == console_writer::SIMPLE ? 2000 diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index 9e7f6501..f4bb5b9d 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -194,7 +194,9 @@ void basic_end_to_end_test(std::string const& compressor, unsigned block_size_bits, file_order_mode file_order, bool with_devices, bool with_specials, bool set_uid, bool set_gid, bool set_time, bool keep_all_times, - bool enable_nlink) { + bool enable_nlink, bool pack_chunk_table, + bool pack_directories, + bool pack_shared_files_table) { block_manager::config cfg; scanner_options options; @@ -207,6 +209,9 @@ void basic_end_to_end_test(std::string const& compressor, options.inode.with_similarity = file_order == file_order_mode::SIMILARITY; options.inode.with_nilsimsa = file_order == file_order_mode::NILSIMSA; options.keep_all_times = keep_all_times; + options.pack_chunk_table = pack_chunk_table; + options.pack_directories = pack_directories; + options.pack_shared_files_table = pack_shared_files_table; if (set_uid) { options.uid = 0; @@ -513,8 +518,9 @@ class compression_test : public testing::TestWithParam< std::tuple> {}; -class scanner_test : public testing::TestWithParam< - std::tuple> { +class scanner_test + : public testing::TestWithParam> { }; TEST_P(compression_test, end_to_end) { @@ -526,16 +532,18 @@ TEST_P(compression_test, end_to_end) { } basic_end_to_end_test(compressor, block_size_bits, file_order, true, true, - false, false, false, false, false); + false, false, false, false, false, true, true, true); } TEST_P(scanner_test, end_to_end) { auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times, - enable_nlink] = GetParam(); + enable_nlink, pack_chunk_table, pack_directories, + pack_shared_files_table] = GetParam(); basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, with_devices, with_specials, set_uid, set_gid, set_time, - keep_all_times, enable_nlink); + keep_all_times, enable_nlink, pack_chunk_table, + pack_directories, pack_shared_files_table); } INSTANTIATE_TEST_SUITE_P( @@ -549,5 +557,6 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( dwarfs, scanner_test, ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool())); diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index 7de6f6ad..b83175de 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -136,6 +136,10 @@ struct fs_options { // time base and offsets are stored with this resolution // 1 = seconds, 60 = minutes, 3600 = hours, ... 2: optional UInt32 time_resolution_sec, + + 3: required bool packed_chunk_table, + 4: required bool packed_directories, + 5: required bool packed_shared_files_table, } /** @@ -168,8 +172,8 @@ struct metadata { * same for all directories. * * Note that this list is stored in a packed format as of v2.3 - * and needs to be unpacked before use. See the documentation - * for the `directory` struct. + * if `options.packed_directories` is `true` and must be unpacked + * before use. See the documentation for the `directory` struct. */ 2: required list directories, @@ -201,8 +205,9 @@ struct metadata { * There's one extra sentinel item at the end that points to the * end of `chunks`, so chunk lookups work the same for all inodes. * - * Note that this is stored delta-compressed as of v2.3 and must - * be unpacked before using. + * Note that this list is stored delta-compressed as of v2.3 + * if `options.packed_chunk_table` is `true` and must be unpacked + * before use. */ 4: required list chunk_table, @@ -286,9 +291,13 @@ struct metadata { /** * Shared files mapping * - * Note that this table cannot be used directly and must first - * be unpacked. It is stored as number of repetitions per index, - * offset by 2 (the minimum number of repetitions), so e.g. + * Note that this list is stored in a packed format if + * `options.packed_shared_files_table` is `true` and must be + * unpacked before use. + * + * In packed format, it is stored as number of repetitions + * per index, offset by 2 (the minimum number of repetitions), + * so e.g. a packed list * * [0, 3, 1, 0, 1] *