mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
refactor(metadata): include self index in directories table
This commit is contained in:
parent
682cf883a2
commit
01f20aa1f1
@ -132,18 +132,18 @@ to each other:
|
||||
╔════╗ ┌────────────────┐ │ S_IFDIR ──►┌───────────────────┐ │ ┌────────────────┴─┐
|
||||
║root╟──►│ name_index: 0 │ │ │ mode_index: 0 ├──────┐ └─►│ parent_entry: 0 │
|
||||
╚════╝ │ inode_num: 0 ├───────┴────────────►│ owner_index: 0 │ │ │ first_entry: 1 │
|
||||
├────────────────┤ │ group_index: 0 │ │ ├──────────────────┤
|
||||
┌───┤ name_index: 2 │ │ atime_offset: 0 │ │ │ parent_entry: 0 │
|
||||
┌────┼───┤ inode_num: 5 ├───────┐ │ mtime_offset: 417 │ │ │ first_entry: 11 │
|
||||
│ │ ├────────────────┤ │ │ ctime_offset: 0 │ │ ├──────────────────┤
|
||||
│ ┌──┼───┤ name_index: 3 │ │ ├───────────────────┤ │ │ parent_entry: 5 │
|
||||
│ │ │ │ inode_num: 9 ├────┐ │ │ ... │ │ │ first_entry: 12 │
|
||||
│ │ │ ├────────────────┤ │ │ S_IFLNK ──►├───────────────────┤ │ ├──────────────────┤
|
||||
│ │ │ │ │ │ │ │ mode_index: 2 │ │ │ │
|
||||
│ │ │ │ ... │ │ └────────────►│ owner_index: 2 │ │ │ ... │
|
||||
│ │ │ │ │ │ │ group_index: 0 │ │ │ │
|
||||
│ │ │ └────────────────┘ │ │ atime_offset: 0 │ │ └──────────────────┘
|
||||
│ │ │ │ │ mtime_offset: 298 │ │
|
||||
├────────────────┤ │ group_index: 0 │ │ | self_entry: 0 |
|
||||
┌───┤ name_index: 2 │ │ atime_offset: 0 │ │ ├──────────────────┤
|
||||
┌────┼───┤ inode_num: 5 ├───────┐ │ mtime_offset: 417 │ │ │ parent_entry: 0 │
|
||||
│ │ ├────────────────┤ │ │ ctime_offset: 0 │ │ │ first_entry: 11 │
|
||||
│ ┌──┼───┤ name_index: 3 │ │ ├───────────────────┤ │ | self_entry: 1 |
|
||||
│ │ │ │ inode_num: 9 ├────┐ │ │ ... │ │ ├──────────────────┤
|
||||
│ │ │ ├────────────────┤ │ │ S_IFLNK ──►├───────────────────┤ │ │ parent_entry: 5 │
|
||||
│ │ │ │ │ │ │ │ mode_index: 2 │ │ │ first_entry: 12 │
|
||||
│ │ │ │ ... │ │ └────────────►│ owner_index: 2 │ │ | self_entry: 7 |
|
||||
│ │ │ │ │ │ │ group_index: 0 │ │ ├──────────────────┤
|
||||
│ │ │ └────────────────┘ │ │ atime_offset: 0 │ │ │ ... │
|
||||
│ │ │ │ │ mtime_offset: 298 │ │ └──────────────────┘
|
||||
│ │ │ │ │ ctime_offset: 0 │ │
|
||||
│ │ │ names[] │ ├───────────────────┤ │ modes[]
|
||||
│ │ │ ┌────────────┐ │ │ ... │ │ ┌─────────────┐
|
||||
|
@ -52,6 +52,11 @@ class global_metadata {
|
||||
using Meta =
|
||||
::apache::thrift::frozen::MappedFrozen<thrift::metadata::metadata>;
|
||||
|
||||
using directories_view = ::apache::thrift::frozen::Layout<
|
||||
std::vector<thrift::metadata::directory>>::View;
|
||||
using bundled_directories_view =
|
||||
::apache::thrift::frozen::Bundled<directories_view>;
|
||||
|
||||
global_metadata(logger& lgr, Meta const& meta);
|
||||
|
||||
static void check_consistency(logger& lgr, Meta const& meta);
|
||||
@ -65,14 +70,12 @@ class global_metadata {
|
||||
|
||||
dwarfs::internal::string_table const& names() const { return names_; }
|
||||
|
||||
std::vector<thrift::metadata::directory> const& directories() const {
|
||||
return directories_;
|
||||
}
|
||||
std::optional<directories_view> bundled_directories() const;
|
||||
|
||||
private:
|
||||
Meta const& meta_;
|
||||
std::vector<thrift::metadata::directory> const directories_;
|
||||
std::vector<uint32_t> const dir_self_index_;
|
||||
std::optional<bundled_directories_view> const bundled_directories_;
|
||||
directories_view const directories_;
|
||||
dwarfs::internal::string_table const names_;
|
||||
};
|
||||
|
||||
|
@ -20,6 +20,7 @@
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <bit>
|
||||
#include <cassert>
|
||||
#include <numeric>
|
||||
#include <queue>
|
||||
@ -38,6 +39,7 @@
|
||||
namespace dwarfs::reader::internal {
|
||||
|
||||
using namespace dwarfs::internal;
|
||||
using namespace ::apache::thrift;
|
||||
|
||||
namespace {
|
||||
|
||||
@ -59,90 +61,127 @@ class stack_ctor {
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<thrift::metadata::directory>
|
||||
std::optional<global_metadata::bundled_directories_view>
|
||||
unpack_directories(logger& lgr, global_metadata::Meta const& meta) {
|
||||
auto has_self_entry = [&] {
|
||||
auto layout = meta.findFirstOfType<
|
||||
std::unique_ptr<frozen::Layout<thrift::metadata::metadata>>>();
|
||||
return (*layout)
|
||||
->directoriesField.layout.itemField.layout.self_entryField.layout
|
||||
.bits > 0;
|
||||
};
|
||||
|
||||
auto opts = meta.options();
|
||||
auto dep = meta.dir_entries();
|
||||
|
||||
if ((!opts or !opts->packed_directories()) and (!dep or has_self_entry())) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
LOG_PROXY(debug_logger_policy, lgr);
|
||||
|
||||
auto td = LOG_TIMED_DEBUG;
|
||||
|
||||
auto dirent = *dep;
|
||||
auto metadir = meta.directories();
|
||||
|
||||
std::vector<thrift::metadata::directory> directories;
|
||||
|
||||
if (auto opts = meta.options(); opts and opts->packed_directories()) {
|
||||
LOG_PROXY(debug_logger_policy, lgr);
|
||||
|
||||
auto ti = LOG_TIMED_DEBUG;
|
||||
|
||||
auto dirent = *meta.dir_entries();
|
||||
auto metadir = meta.directories();
|
||||
|
||||
if (opts->packed_directories()) {
|
||||
directories.resize(metadir.size());
|
||||
|
||||
// delta-decode first entries first
|
||||
directories[0].first_entry() = metadir[0].first_entry();
|
||||
{
|
||||
auto tt = LOG_TIMED_TRACE;
|
||||
|
||||
for (size_t i = 1; i < directories.size(); ++i) {
|
||||
directories[i].first_entry() =
|
||||
directories[i - 1].first_entry().value() + metadir[i].first_entry();
|
||||
directories[0].first_entry() = metadir[0].first_entry();
|
||||
|
||||
for (size_t i = 1; i < directories.size(); ++i) {
|
||||
directories[i].first_entry() =
|
||||
directories[i - 1].first_entry().value() + metadir[i].first_entry();
|
||||
}
|
||||
|
||||
tt << "delta-decoded " << directories.size() << " first entries";
|
||||
}
|
||||
|
||||
// then traverse to recover parent entries
|
||||
std::queue<uint32_t> queue;
|
||||
queue.push(0);
|
||||
{
|
||||
auto tt = LOG_TIMED_TRACE;
|
||||
|
||||
while (!queue.empty()) {
|
||||
auto parent = queue.front();
|
||||
queue.pop();
|
||||
std::queue<uint32_t> queue;
|
||||
queue.push(0);
|
||||
|
||||
auto p_ino = dirent[parent].inode_num();
|
||||
while (!queue.empty()) {
|
||||
auto parent = queue.front();
|
||||
queue.pop();
|
||||
|
||||
auto beg = directories[p_ino].first_entry().value();
|
||||
auto end = directories[p_ino + 1].first_entry().value();
|
||||
auto p_ino = dirent[parent].inode_num();
|
||||
|
||||
for (auto e = beg; e < end; ++e) {
|
||||
if (auto e_ino = dirent[e].inode_num();
|
||||
e_ino < (directories.size() - 1)) {
|
||||
directories[e_ino].parent_entry() = parent;
|
||||
queue.push(e);
|
||||
auto beg = directories[p_ino].first_entry().value();
|
||||
auto end = directories[p_ino + 1].first_entry().value();
|
||||
|
||||
for (auto e = beg; e < end; ++e) {
|
||||
if (auto e_ino = dirent[e].inode_num();
|
||||
e_ino < (directories.size() - 1)) {
|
||||
directories[e_ino].parent_entry() = parent;
|
||||
queue.push(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ti << "unpacked directories table";
|
||||
tt << "recovered " << directories.size() << " parent entries";
|
||||
}
|
||||
} else {
|
||||
auto tt = LOG_TIMED_TRACE;
|
||||
|
||||
directories = metadir.thaw();
|
||||
|
||||
tt << "thawed " << directories.size() << " directories";
|
||||
}
|
||||
|
||||
return directories;
|
||||
}
|
||||
// finally, set self entries
|
||||
{
|
||||
auto tt = LOG_TIMED_TRACE;
|
||||
|
||||
std::vector<uint32_t>
|
||||
build_dir_self_index(logger& lgr, global_metadata::Meta const& meta) {
|
||||
std::vector<uint32_t> index;
|
||||
|
||||
if (auto dep = meta.dir_entries()) {
|
||||
LOG_PROXY(debug_logger_policy, lgr);
|
||||
|
||||
auto ti = LOG_TIMED_DEBUG;
|
||||
|
||||
auto const dir_count = meta.directories().size() - 1;
|
||||
|
||||
index.resize(dir_count);
|
||||
|
||||
for (size_t i = 0; i < dep->size(); ++i) {
|
||||
auto ino = (*dep)[i].inode_num();
|
||||
if (ino < dir_count) {
|
||||
index[ino] = i;
|
||||
for (size_t i = 0; i < dirent.size(); ++i) {
|
||||
auto ino = dirent[i].inode_num();
|
||||
if (ino < directories.size()) {
|
||||
directories[ino].self_entry() = i;
|
||||
}
|
||||
}
|
||||
|
||||
auto check_index [[maybe_unused]] = [&] {
|
||||
auto tmp = index;
|
||||
std::sort(tmp.begin(), tmp.end());
|
||||
std::adjacent_difference(tmp.begin(), tmp.end(), tmp.begin());
|
||||
return std::all_of(tmp.begin() + 1, tmp.end(),
|
||||
[](auto i) { return i != 0; });
|
||||
};
|
||||
|
||||
assert(check_index());
|
||||
|
||||
ti << "built directory self index table (size: " << dir_count << ")";
|
||||
tt << "recoverd " << directories.size() << " self entries from "
|
||||
<< dirent.size() << " dir entries";
|
||||
}
|
||||
|
||||
return index;
|
||||
// freeze to save memory
|
||||
auto view = [&] {
|
||||
auto tt = LOG_TIMED_TRACE;
|
||||
|
||||
auto v = frozen::freeze(directories);
|
||||
|
||||
tt << "froze " << directories.size() << " directories ("
|
||||
<< size_with_unit(sizeof(thrift::metadata::directory) *
|
||||
directories.size())
|
||||
<< ")";
|
||||
|
||||
return v;
|
||||
}();
|
||||
|
||||
auto l_old = meta.findFirstOfType<
|
||||
std::unique_ptr<frozen::Layout<thrift::metadata::metadata>>>();
|
||||
auto bits_per_dir_old =
|
||||
(*l_old)->directoriesField.layout.itemField.layout.bits;
|
||||
auto l_new = view.findFirstOfType<std::unique_ptr<
|
||||
frozen::Layout<std::vector<thrift::metadata::directory>>>>();
|
||||
auto bits_per_dir_new = (*l_new)->itemField.layout.bits;
|
||||
|
||||
td << "unpacked directories table with " << directories.size() << " entries ("
|
||||
<< size_with_unit((bits_per_dir_old * directories.size() + 7) / 8)
|
||||
<< " -> "
|
||||
<< size_with_unit((bits_per_dir_new * directories.size() + 7) / 8) << ")";
|
||||
|
||||
return view;
|
||||
}
|
||||
|
||||
// TODO: merge with inode_rank in metadata_v2
|
||||
@ -682,12 +721,18 @@ check_metadata(logger& lgr, global_metadata::Meta const& meta, bool check) {
|
||||
return meta;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T unbundled(frozen::Bundled<T> const& bundle) {
|
||||
return bundle;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
global_metadata::global_metadata(logger& lgr, Meta const& meta)
|
||||
: meta_{meta}
|
||||
, directories_{unpack_directories(lgr, meta_)}
|
||||
, dir_self_index_{build_dir_self_index(lgr, meta_)}
|
||||
, bundled_directories_{unpack_directories(lgr, meta_)}
|
||||
, directories_{bundled_directories_ ? unbundled(*bundled_directories_)
|
||||
: meta_.directories()}
|
||||
, names_{meta_.compact_names()
|
||||
? string_table(lgr, "names", *meta_.compact_names())
|
||||
: string_table(meta_.names())} {}
|
||||
@ -701,18 +746,27 @@ void global_metadata::check_consistency(logger& lgr) const {
|
||||
}
|
||||
|
||||
uint32_t global_metadata::first_dir_entry(uint32_t ino) const {
|
||||
return !directories_.empty() ? directories_[ino].first_entry().value()
|
||||
: meta_.directories()[ino].first_entry();
|
||||
return directories_[ino].first_entry();
|
||||
}
|
||||
|
||||
uint32_t global_metadata::parent_dir_entry(uint32_t ino) const {
|
||||
return !directories_.empty() ? directories_[ino].parent_entry().value()
|
||||
: meta_.directories()[ino].parent_entry();
|
||||
return directories_[ino].parent_entry();
|
||||
}
|
||||
|
||||
uint32_t global_metadata::self_dir_entry(uint32_t ino) const {
|
||||
return !dir_self_index_.empty() ? dir_self_index_[ino]
|
||||
: meta_.entry_table_v2_2()[ino];
|
||||
if (!meta_.entry_table_v2_2().empty()) {
|
||||
return meta_.entry_table_v2_2()[ino];
|
||||
}
|
||||
|
||||
return directories_[ino].self_entry();
|
||||
}
|
||||
|
||||
auto global_metadata::bundled_directories() const
|
||||
-> std::optional<directories_view> {
|
||||
if (bundled_directories_) {
|
||||
return directories_;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
auto inode_view_impl::mode() const -> mode_type {
|
||||
|
@ -1453,8 +1453,8 @@ thrift::metadata::metadata metadata_<LoggerPolicy>::unpack_metadata() const {
|
||||
if (opts->packed_chunk_table().value()) {
|
||||
meta.chunk_table() = chunk_table_;
|
||||
}
|
||||
if (opts->packed_directories().value()) {
|
||||
meta.directories() = global_.directories();
|
||||
if (auto const& dirs = global_.bundled_directories()) {
|
||||
meta.directories() = dirs->thaw();
|
||||
}
|
||||
if (opts->packed_shared_files_table().value()) {
|
||||
meta.shared_files_table() = shared_files_;
|
||||
|
@ -378,6 +378,9 @@ void dir::pack(thrift::metadata::metadata& mv2,
|
||||
d.parent_entry() = 0;
|
||||
}
|
||||
d.first_entry() = mv2.dir_entries()->size();
|
||||
auto se = entry_index();
|
||||
DWARFS_CHECK(se, "self entry index not set");
|
||||
d.self_entry() = *se;
|
||||
mv2.directories()->push_back(d);
|
||||
for (entry_ptr const& e : entries_) {
|
||||
e->set_entry_index(mv2.dir_entries()->size());
|
||||
|
@ -201,6 +201,7 @@ class save_directories_visitor : public visitor_base {
|
||||
thrift::metadata::directory dummy;
|
||||
dummy.parent_entry() = 0;
|
||||
dummy.first_entry() = mv2.dir_entries()->size();
|
||||
dummy.self_entry() = 0;
|
||||
mv2.directories()->push_back(dummy);
|
||||
|
||||
directories_.clear();
|
||||
@ -939,6 +940,7 @@ void scanner_<LoggerPolicy>::scan(
|
||||
|
||||
for (auto& d : mv2.directories().value()) {
|
||||
d.parent_entry() = 0; // this will be recovered
|
||||
d.self_entry() = 0; // this will be recovered
|
||||
auto delta = d.first_entry().value() - last_first_entry;
|
||||
last_first_entry = d.first_entry().value();
|
||||
d.first_entry() = delta;
|
||||
|
@ -62,17 +62,23 @@ struct chunk {
|
||||
* ..
|
||||
* dir_entries[directory[inode + 1].first_entry - 1]
|
||||
*
|
||||
* Note that the `first_entry` fields are stored delta-compressed
|
||||
* as of v2.3 and must be unpacked before using. Also note that
|
||||
* the `parent_entry` fields are all set to zero as of v2.3. The
|
||||
* `parent_entry` information can easily and quickly be built by
|
||||
* traversing the `dir_entries` using the unpacked `first_entry`
|
||||
* Note that as of v2.3, directory entries can be stored "packed", in
|
||||
* which case only the `first_entry` fields are populated and stored
|
||||
* delta-compressed. The `first_entry` field must be unpacked before
|
||||
* using and the `parent_entry` and `self_entry` fields must be built
|
||||
* by traversing the `dir_entries` using the unpacked `first_entry`
|
||||
* fields.
|
||||
*/
|
||||
struct directory {
|
||||
1: UInt32 parent_entry // indexes into `dir_entries`
|
||||
|
||||
2: UInt32 first_entry // indexes into `dir_entries`
|
||||
|
||||
//==========================================================//
|
||||
// fields added with dwarfs-0.11.0, file system version 2.5 //
|
||||
//==========================================================//
|
||||
|
||||
3: UInt32 self_entry // indexes into `dir_entries`
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
x
Reference in New Issue
Block a user