From 01f20aa1f1f20c3c83956d4d81c1da3e40dbad39 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Thu, 22 Aug 2024 23:55:51 +0200 Subject: [PATCH] refactor(metadata): include self index in directories table --- doc/dwarfs-format.md | 24 +-- .../dwarfs/reader/internal/metadata_types.h | 13 +- src/reader/internal/metadata_types.cpp | 188 +++++++++++------- src/reader/internal/metadata_v2.cpp | 4 +- src/writer/internal/entry.cpp | 3 + src/writer/scanner.cpp | 2 + thrift/metadata.thrift | 16 +- 7 files changed, 159 insertions(+), 91 deletions(-) diff --git a/doc/dwarfs-format.md b/doc/dwarfs-format.md index 6643047d..bce27562 100644 --- a/doc/dwarfs-format.md +++ b/doc/dwarfs-format.md @@ -132,18 +132,18 @@ to each other: ╔════╗ ┌────────────────┐ │ S_IFDIR ──►┌───────────────────┐ │ ┌────────────────┴─┐ ║root╟──►│ name_index: 0 │ │ │ mode_index: 0 ├──────┐ └─►│ parent_entry: 0 │ ╚════╝ │ inode_num: 0 ├───────┴────────────►│ owner_index: 0 │ │ │ first_entry: 1 │ - ├────────────────┤ │ group_index: 0 │ │ ├──────────────────┤ - ┌───┤ name_index: 2 │ │ atime_offset: 0 │ │ │ parent_entry: 0 │ - ┌────┼───┤ inode_num: 5 ├───────┐ │ mtime_offset: 417 │ │ │ first_entry: 11 │ - │ │ ├────────────────┤ │ │ ctime_offset: 0 │ │ ├──────────────────┤ - │ ┌──┼───┤ name_index: 3 │ │ ├───────────────────┤ │ │ parent_entry: 5 │ - │ │ │ │ inode_num: 9 ├────┐ │ │ ... │ │ │ first_entry: 12 │ - │ │ │ ├────────────────┤ │ │ S_IFLNK ──►├───────────────────┤ │ ├──────────────────┤ - │ │ │ │ │ │ │ │ mode_index: 2 │ │ │ │ - │ │ │ │ ... │ │ └────────────►│ owner_index: 2 │ │ │ ... 
│ - │ │ │ │ │ │ │ group_index: 0 │ │ │ │ - │ │ │ └────────────────┘ │ │ atime_offset: 0 │ │ └──────────────────┘ - │ │ │ │ │ mtime_offset: 298 │ │ + ├────────────────┤ │ group_index: 0 │ │ │ self_entry: 0 │ + ┌───┤ name_index: 2 │ │ atime_offset: 0 │ │ ├──────────────────┤ + ┌────┼───┤ inode_num: 5 ├───────┐ │ mtime_offset: 417 │ │ │ parent_entry: 0 │ + │ │ ├────────────────┤ │ │ ctime_offset: 0 │ │ │ first_entry: 11 │ + │ ┌──┼───┤ name_index: 3 │ │ ├───────────────────┤ │ │ self_entry: 1 │ + │ │ │ │ inode_num: 9 ├────┐ │ │ ... │ │ ├──────────────────┤ + │ │ │ ├────────────────┤ │ │ S_IFLNK ──►├───────────────────┤ │ │ parent_entry: 5 │ + │ │ │ │ │ │ │ │ mode_index: 2 │ │ │ first_entry: 12 │ + │ │ │ │ ... │ │ └────────────►│ owner_index: 2 │ │ │ self_entry: 7 │ + │ │ │ │ │ │ │ group_index: 0 │ │ ├──────────────────┤ + │ │ │ └────────────────┘ │ │ atime_offset: 0 │ │ │ ... │ + │ │ │ │ │ mtime_offset: 298 │ │ └──────────────────┘ │ │ │ │ │ ctime_offset: 0 │ │ │ │ │ names[] │ ├───────────────────┤ │ modes[] │ │ │ ┌────────────┐ │ │ ... 
│ │ ┌─────────────┐ diff --git a/include/dwarfs/reader/internal/metadata_types.h b/include/dwarfs/reader/internal/metadata_types.h index e7884312..89d6499c 100644 --- a/include/dwarfs/reader/internal/metadata_types.h +++ b/include/dwarfs/reader/internal/metadata_types.h @@ -52,6 +52,11 @@ class global_metadata { using Meta = ::apache::thrift::frozen::MappedFrozen; + using directories_view = ::apache::thrift::frozen::Layout< + std::vector>::View; + using bundled_directories_view = + ::apache::thrift::frozen::Bundled; + global_metadata(logger& lgr, Meta const& meta); static void check_consistency(logger& lgr, Meta const& meta); @@ -65,14 +70,12 @@ class global_metadata { dwarfs::internal::string_table const& names() const { return names_; } - std::vector const& directories() const { - return directories_; - } + std::optional bundled_directories() const; private: Meta const& meta_; - std::vector const directories_; - std::vector const dir_self_index_; + std::optional const bundled_directories_; + directories_view const directories_; dwarfs::internal::string_table const names_; }; diff --git a/src/reader/internal/metadata_types.cpp b/src/reader/internal/metadata_types.cpp index 765d06a4..397c1fa4 100644 --- a/src/reader/internal/metadata_types.cpp +++ b/src/reader/internal/metadata_types.cpp @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -38,6 +39,7 @@ namespace dwarfs::reader::internal { using namespace dwarfs::internal; +using namespace ::apache::thrift; namespace { @@ -59,90 +61,127 @@ class stack_ctor { } }; -std::vector +std::optional unpack_directories(logger& lgr, global_metadata::Meta const& meta) { + auto has_self_entry = [&] { + auto layout = meta.findFirstOfType< + std::unique_ptr>>(); + return (*layout) + ->directoriesField.layout.itemField.layout.self_entryField.layout + .bits > 0; + }; + + auto opts = meta.options(); + auto dep = meta.dir_entries(); + + if ((!opts or !opts->packed_directories()) and (!dep or has_self_entry())) { + 
return std::nullopt; + } + + LOG_PROXY(debug_logger_policy, lgr); + + auto td = LOG_TIMED_DEBUG; + + auto dirent = *dep; + auto metadir = meta.directories(); + std::vector directories; - if (auto opts = meta.options(); opts and opts->packed_directories()) { - LOG_PROXY(debug_logger_policy, lgr); - - auto ti = LOG_TIMED_DEBUG; - - auto dirent = *meta.dir_entries(); - auto metadir = meta.directories(); - + if (opts->packed_directories()) { directories.resize(metadir.size()); // delta-decode first entries first - directories[0].first_entry() = metadir[0].first_entry(); + { + auto tt = LOG_TIMED_TRACE; - for (size_t i = 1; i < directories.size(); ++i) { - directories[i].first_entry() = - directories[i - 1].first_entry().value() + metadir[i].first_entry(); + directories[0].first_entry() = metadir[0].first_entry(); + + for (size_t i = 1; i < directories.size(); ++i) { + directories[i].first_entry() = + directories[i - 1].first_entry().value() + metadir[i].first_entry(); + } + + tt << "delta-decoded " << directories.size() << " first entries"; } // then traverse to recover parent entries - std::queue queue; - queue.push(0); + { + auto tt = LOG_TIMED_TRACE; - while (!queue.empty()) { - auto parent = queue.front(); - queue.pop(); + std::queue queue; + queue.push(0); - auto p_ino = dirent[parent].inode_num(); + while (!queue.empty()) { + auto parent = queue.front(); + queue.pop(); - auto beg = directories[p_ino].first_entry().value(); - auto end = directories[p_ino + 1].first_entry().value(); + auto p_ino = dirent[parent].inode_num(); - for (auto e = beg; e < end; ++e) { - if (auto e_ino = dirent[e].inode_num(); - e_ino < (directories.size() - 1)) { - directories[e_ino].parent_entry() = parent; - queue.push(e); + auto beg = directories[p_ino].first_entry().value(); + auto end = directories[p_ino + 1].first_entry().value(); + + for (auto e = beg; e < end; ++e) { + if (auto e_ino = dirent[e].inode_num(); + e_ino < (directories.size() - 1)) { + directories[e_ino].parent_entry() 
= parent; + queue.push(e); + } } } - } - ti << "unpacked directories table"; + tt << "recovered " << directories.size() << " parent entries"; + } + } else { + auto tt = LOG_TIMED_TRACE; + + directories = metadir.thaw(); + + tt << "thawed " << directories.size() << " directories"; } - return directories; -} + // finally, set self entries + { + auto tt = LOG_TIMED_TRACE; -std::vector -build_dir_self_index(logger& lgr, global_metadata::Meta const& meta) { - std::vector index; - - if (auto dep = meta.dir_entries()) { - LOG_PROXY(debug_logger_policy, lgr); - - auto ti = LOG_TIMED_DEBUG; - - auto const dir_count = meta.directories().size() - 1; - - index.resize(dir_count); - - for (size_t i = 0; i < dep->size(); ++i) { - auto ino = (*dep)[i].inode_num(); - if (ino < dir_count) { - index[ino] = i; + for (size_t i = 0; i < dirent.size(); ++i) { + auto ino = dirent[i].inode_num(); + if (ino < directories.size()) { + directories[ino].self_entry() = i; } } - auto check_index [[maybe_unused]] = [&] { - auto tmp = index; - std::sort(tmp.begin(), tmp.end()); - std::adjacent_difference(tmp.begin(), tmp.end(), tmp.begin()); - return std::all_of(tmp.begin() + 1, tmp.end(), - [](auto i) { return i != 0; }); - }; - - assert(check_index()); - - ti << "built directory self index table (size: " << dir_count << ")"; + tt << "recovered " << directories.size() << " self entries from " + << dirent.size() << " dir entries"; } - return index; + // freeze to save memory + auto view = [&] { + auto tt = LOG_TIMED_TRACE; + + auto v = frozen::freeze(directories); + + tt << "froze " << directories.size() << " directories (" + << size_with_unit(sizeof(thrift::metadata::directory) * + directories.size()) + << ")"; + + return v; + }(); + + auto l_old = meta.findFirstOfType< + std::unique_ptr>>(); + auto bits_per_dir_old = + (*l_old)->directoriesField.layout.itemField.layout.bits; + auto l_new = view.findFirstOfType>>>(); + auto bits_per_dir_new = (*l_new)->itemField.layout.bits; + + td << "unpacked 
directories table with " << directories.size() << " entries (" + << size_with_unit((bits_per_dir_old * directories.size() + 7) / 8) + << " -> " + << size_with_unit((bits_per_dir_new * directories.size() + 7) / 8) << ")"; + + return view; } // TODO: merge with inode_rank in metadata_v2 @@ -682,12 +721,18 @@ check_metadata(logger& lgr, global_metadata::Meta const& meta, bool check) { return meta; } +template +T unbundled(frozen::Bundled const& bundle) { + return bundle; +} + } // namespace global_metadata::global_metadata(logger& lgr, Meta const& meta) : meta_{meta} - , directories_{unpack_directories(lgr, meta_)} - , dir_self_index_{build_dir_self_index(lgr, meta_)} + , bundled_directories_{unpack_directories(lgr, meta_)} + , directories_{bundled_directories_ ? unbundled(*bundled_directories_) + : meta_.directories()} , names_{meta_.compact_names() ? string_table(lgr, "names", *meta_.compact_names()) : string_table(meta_.names())} {} @@ -701,18 +746,27 @@ void global_metadata::check_consistency(logger& lgr) const { } uint32_t global_metadata::first_dir_entry(uint32_t ino) const { - return !directories_.empty() ? directories_[ino].first_entry().value() - : meta_.directories()[ino].first_entry(); + return directories_[ino].first_entry(); } uint32_t global_metadata::parent_dir_entry(uint32_t ino) const { - return !directories_.empty() ? directories_[ino].parent_entry().value() - : meta_.directories()[ino].parent_entry(); + return directories_[ino].parent_entry(); } uint32_t global_metadata::self_dir_entry(uint32_t ino) const { - return !dir_self_index_.empty() ? 
dir_self_index_[ino] - : meta_.entry_table_v2_2()[ino]; + if (!meta_.entry_table_v2_2().empty()) { + return meta_.entry_table_v2_2()[ino]; + } + + return directories_[ino].self_entry(); +} + +auto global_metadata::bundled_directories() const + -> std::optional { + if (bundled_directories_) { + return directories_; + } + return std::nullopt; } auto inode_view_impl::mode() const -> mode_type { diff --git a/src/reader/internal/metadata_v2.cpp b/src/reader/internal/metadata_v2.cpp index 263fbe50..f2b7dbd3 100644 --- a/src/reader/internal/metadata_v2.cpp +++ b/src/reader/internal/metadata_v2.cpp @@ -1453,8 +1453,8 @@ thrift::metadata::metadata metadata_::unpack_metadata() const { if (opts->packed_chunk_table().value()) { meta.chunk_table() = chunk_table_; } - if (opts->packed_directories().value()) { - meta.directories() = global_.directories(); + if (auto const& dirs = global_.bundled_directories()) { + meta.directories() = dirs->thaw(); } if (opts->packed_shared_files_table().value()) { meta.shared_files_table() = shared_files_; diff --git a/src/writer/internal/entry.cpp b/src/writer/internal/entry.cpp index 5fd7f26f..74cd0238 100644 --- a/src/writer/internal/entry.cpp +++ b/src/writer/internal/entry.cpp @@ -378,6 +378,9 @@ void dir::pack(thrift::metadata::metadata& mv2, d.parent_entry() = 0; } d.first_entry() = mv2.dir_entries()->size(); + auto se = entry_index(); + DWARFS_CHECK(se, "self entry index not set"); + d.self_entry() = *se; mv2.directories()->push_back(d); for (entry_ptr const& e : entries_) { e->set_entry_index(mv2.dir_entries()->size()); diff --git a/src/writer/scanner.cpp b/src/writer/scanner.cpp index b4dcd848..9a6243f0 100644 --- a/src/writer/scanner.cpp +++ b/src/writer/scanner.cpp @@ -201,6 +201,7 @@ class save_directories_visitor : public visitor_base { thrift::metadata::directory dummy; dummy.parent_entry() = 0; dummy.first_entry() = mv2.dir_entries()->size(); + dummy.self_entry() = 0; mv2.directories()->push_back(dummy); directories_.clear(); @@ 
-939,6 +940,7 @@ void scanner_::scan( for (auto& d : mv2.directories().value()) { d.parent_entry() = 0; // this will be recovered + d.self_entry() = 0; // this will be recovered auto delta = d.first_entry().value() - last_first_entry; last_first_entry = d.first_entry().value(); d.first_entry() = delta; diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index 5f95b50e..1757754b 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -62,17 +62,23 @@ struct chunk { * .. * dir_entries[directory[inode + 1].first_entry - 1] * - * Note that the `first_entry` fields are stored delta-compressed - * as of v2.3 and must be unpacked before using. Also note that - * the `parent_entry` fields are all set to zero as of v2.3. The - * `parent_entry` information can easily and quickly be built by - * traversing the `dir_entries` using the unpacked `first_entry` + * Note that as of v2.3, directory entries can be stored "packed", in + * which case only the `first_entry` fields are populated and stored + * delta-compressed. The `first_entry` field must be unpacked before + * using and the `parent_entry` and `self_entry` fields must be built + * by traversing the `dir_entries` using the unpacked `first_entry` * fields. */ struct directory { 1: UInt32 parent_entry // indexes into `dir_entries` 2: UInt32 first_entry // indexes into `dir_entries` + + //==========================================================// + // fields added with dwarfs-0.11.0, file system version 2.5 // + //==========================================================// + + 3: UInt32 self_entry // indexes into `dir_entries` } /**