From ad48cae7b19ba1c7cfe2e9bc49c43b5630473f26 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 16 Mar 2021 20:09:07 +0100 Subject: [PATCH] Unique files table, deprecate inode_v2_2, some cleanup --- include/dwarfs/entry.h | 2 +- include/dwarfs/metadata_types.h | 6 --- src/dwarfs.cpp | 3 +- src/dwarfs/entry.cpp | 7 +-- src/dwarfs/metadata_types.cpp | 25 +++-------- src/dwarfs/metadata_v2.cpp | 77 +++++++++++++++++---------------- src/dwarfs/scanner.cpp | 33 +++++++++++--- test/dwarfs.cpp | 8 ++-- thrift/metadata.thrift | 9 +++- 9 files changed, 87 insertions(+), 83 deletions(-) diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index 7576c6a4..ab7b7fc3 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -124,7 +124,7 @@ class file : public entry { void hardlink(file* other, progress& prog); uint64_t raw_inode_num() const; unsigned num_hard_links() const; - uint32_t content_index() const; + uint32_t unique_file_id() const; private: struct data { diff --git a/include/dwarfs/metadata_types.h b/include/dwarfs/metadata_types.h index 44c4b4a9..af53e32d 100644 --- a/include/dwarfs/metadata_types.h +++ b/include/dwarfs/metadata_types.h @@ -69,10 +69,6 @@ class inode_view Meta const* meta_; }; -/** - * THIS *MUST* BE CONSTRUCTIBLE FROM ONLY AN INODE NUMBER (NOT EVEN AN - * INODE_VIEW) - */ class directory_view : public ::apache::thrift::frozen::View { using DirView = ::apache::thrift::frozen::View; @@ -124,8 +120,6 @@ class dir_entry_view { bool is_root() const; - // TODO: remove? - // std::optional directory() const; std::optional parent() const; std::string path() const; diff --git a/src/dwarfs.cpp b/src/dwarfs.cpp index 0b01d30f..69666aa0 100644 --- a/src/dwarfs.cpp +++ b/src/dwarfs.cpp @@ -282,8 +282,7 @@ void op_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info* fi) { } else if (fi->flags & (O_APPEND | O_CREAT | O_TRUNC)) { err = EACCES; } else { - fi->fh = FUSE_ROOT_ID + - entry->content_index(); // <<---- THIS IS NOT THE INODE!!!! + fi->fh = FUSE_ROOT_ID + entry->inode_num(); fi->direct_io = !s_opts.cache_files; fi->keep_cache = s_opts.cache_files; fuse_reply_open(req, fi); diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index c720bf0f..a6ad34eb 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -109,11 +109,6 @@ void entry::pack(thrift::metadata::inode_data& entry_v2, entry_v2.atime_offset = data.get_atime_offset(stat_.st_atime); entry_v2.mtime_offset = data.get_mtime_offset(stat_.st_mtime); entry_v2.ctime_offset = data.get_ctime_offset(stat_.st_ctime); - if (auto fp = dynamic_cast(this)) { - entry_v2.content_index = fp->content_index(); - } else { - entry_v2.content_index = inode_num(); - } } entry::type_t file::type() const { return E_FILE; } @@ -189,7 +184,7 @@ void file::scan(os_access& os, progress& prog) { } } -uint32_t file::content_index() const { return inode_->num(); } +uint32_t file::unique_file_id() const { return inode_->num(); } uint64_t file::raw_inode_num() const { return status().st_ino; } diff --git a/src/dwarfs/metadata_types.cpp b/src/dwarfs/metadata_types.cpp index 67d49921..87233828 100644 --- a/src/dwarfs/metadata_types.cpp +++ b/src/dwarfs/metadata_types.cpp @@ -54,32 +54,17 @@ inode_view dir_entry_view::inode() const { dev.inode_num(), meta_); }, [this](InodeView const& iv) { - return inode_view(iv, iv.content_index(), meta_); + return inode_view(iv, iv.inode_v2_2(), meta_); }, }, v_); } -// TODO: remove? -// std::optional dir_entry_view::directory() const { -// if (is_root()) { -// return std::nullopt; -// } -// -// auto dir_inode = parent_index_; -// -// if (auto de = meta_->dir_entries()) { -// dir_inode = (*de)[dir_inode].entry_index(); -// } -// -// return directory_view(dir_inode, meta_); -// } - bool dir_entry_view::is_root() const { return std::visit( overloaded{ [](DirEntryView const& dev) { return dev.inode_num() == 0; }, - [](InodeView const& iv) { return iv.content_index() == 0; }, + [](InodeView const& iv) { return iv.inode_v2_2() == 0; }, }, v_); } @@ -126,11 +111,11 @@ dir_entry_view::from_dir_entry_index(uint32_t self_index, Meta const* meta) { DWARFS_CHECK(self_index < meta->entries().size(), "self_index out of range"); auto iv = meta->entries()[self_index]; - DWARFS_CHECK(iv.content_index() < meta->directories().size(), + DWARFS_CHECK(iv.inode_v2_2() < meta->directories().size(), "parent_index out of range"); return dir_entry_view( iv, self_index, - meta->entry_table_v2_2()[meta->directories()[iv.content_index()] + meta->entry_table_v2_2()[meta->directories()[iv.inode_v2_2()] .parent_entry()], meta); } @@ -164,7 +149,7 @@ inode_view dir_entry_view::inode(uint32_t index, Meta const* meta) { DWARFS_CHECK(index < meta->entries().size(), "index out of range"); auto iv = meta->entries()[index]; - return inode_view(iv, iv.content_index(), meta); + return inode_view(iv, iv.inode_v2_2(), meta); } std::string dir_entry_view::path() const { diff --git a/src/dwarfs/metadata_v2.cpp b/src/dwarfs/metadata_v2.cpp index 769e2e25..ce2dfc4b 100644 --- a/src/dwarfs/metadata_v2.cpp +++ b/src/dwarfs/metadata_v2.cpp @@ -343,24 +343,35 @@ class metadata_ final : public metadata_v2::impl { std::string modestring(uint16_t mode) const; + std::optional get_chunk_range(int inode) const { + std::optional rv; + + inode -= file_index_offset_; + + if (auto uf = meta_.unique_files_table()) { + if (inode < 0 or inode >= static_cast(uf->size())) { + return rv; + } + + inode = (*uf)[inode]; + } + + if (inode >= 0 && + inode < (static_cast(meta_.chunk_table().size()) - 1)) { + uint32_t begin = meta_.chunk_table()[inode]; + uint32_t end = meta_.chunk_table()[inode + 1]; + rv = chunk_range(&meta_, begin, end); + } + + return rv; + } + size_t reg_file_size(inode_view entry) const { - auto inode = entry.content_index() - file_index_offset_; - uint32_t cur = meta_.chunk_table()[inode]; - uint32_t end = meta_.chunk_table()[inode + 1]; - if (cur > end) { - DWARFS_THROW(runtime_error, - fmt::format("invalid chunk range: [{0}..{1}]", cur, end)); - } - if (end > meta_.chunks().size()) { - DWARFS_THROW(runtime_error, - fmt::format("chunk index out of range: {0} > {1}", end, - meta_.chunks().size())); - } - size_t size = 0; - while (cur < end) { - size += meta_.chunks()[cur++].size(); - } - return size; + auto cr = get_chunk_range(entry.inode_num()); + DWARFS_CHECK(cr, "invalid chunk range"); + return std::accumulate( + cr->begin(), cr->end(), static_cast(0), + [](size_t s, chunk_view cv) { return s + cv.size(); }); } size_t file_size(inode_view entry, uint16_t mode) const { @@ -401,7 +412,7 @@ class metadata_ final : public metadata_v2::impl { } std::string_view link_value(inode_view entry) const { - return meta_.symlinks()[meta_.symlink_table()[entry.content_index() - + return meta_.symlinks()[meta_.symlink_table()[entry.inode_num() - symlink_table_offset_]]; } @@ -430,7 +441,7 @@ class metadata_ final : public metadata_v2::impl { } } else { for (auto e : meta_.entries()) { - auto index = int(e.content_index()) - file_index_offset_; + auto index = int(e.inode_v2_2()) - file_index_offset_; if (index >= 0 && index < int(nlinks.size())) { ++DWARFS_NOTHROW(nlinks.at(index)); } @@ -464,7 +475,7 @@ void metadata_::dump( std::function const& icb) const { auto inode_data = entry.inode(); auto mode = inode_data.mode(); - auto inode = inode_data.content_index(); // TODO: rename inode appropriately + auto inode = inode_data.inode_num(); // TODO: rename inode appropriately os << indent << " " << modestring(mode); @@ -473,9 +484,9 @@ void metadata_::dump( } if (S_ISREG(mode)) { - uint32_t beg = meta_.chunk_table()[inode - file_index_offset_]; - uint32_t end = meta_.chunk_table()[inode - file_index_offset_ + 1]; - os << " [" << beg << ", " << end << "]"; + auto cr = get_chunk_range(inode); + DWARFS_CHECK(cr, "invalid chunk range"); + os << " [" << cr->begin_ << ", " << cr->end_ << "]"; os << " " << file_size(inode_data, mode) << "\n"; if (detail_level > 3) { icb(indent + " ", inode); @@ -581,7 +592,7 @@ folly::dynamic metadata_::as_dynamic(dir_entry_view entry) const { auto inode_data = entry.inode(); auto mode = inode_data.mode(); - auto inode = inode_data.content_index(); // TODO: rename all the things + auto inode = inode_data.inode_num(); // TODO: rename all the things obj["mode"] = mode; obj["modestring"] = modestring(mode); @@ -674,7 +685,7 @@ void metadata_::walk(uint32_t self_index, uint32_t parent_index, auto inode_data = entry.inode(); if (S_ISDIR(inode_data.mode())) { - auto inode = inode_data.content_index(); + auto inode = inode_data.inode_num(); if (!seen.emplace(inode).second) { DWARFS_THROW(runtime_error, "cycle detected during directory walk"); @@ -710,8 +721,8 @@ void metadata_::walk_inode_order_impl( } else { std::sort(entries.begin(), entries.end(), [this](auto const& a, auto const& b) { - return meta_.entries()[a.first].content_index() < - meta_.entries()[b.first].content_index(); + return meta_.entries()[a.first].inode_v2_2() < + meta_.entries()[b.first].inode_v2_2(); }); } @@ -905,7 +916,7 @@ int metadata_::access(inode_view entry, int mode, uid_t uid, template int metadata_::open(inode_view entry) const { if (S_ISREG(entry.mode())) { - return entry.content_index(); + return entry.inode_num(); } return -1; @@ -949,15 +960,7 @@ int metadata_::statvfs(struct ::statvfs* stbuf) const { template std::optional metadata_::get_chunks(int inode) const { - std::optional rv; - inode -= inode_offset_ + file_index_offset_; - if (inode >= 0 && - inode < (static_cast(meta_.chunk_table().size()) - 1)) { - uint32_t begin = meta_.chunk_table()[inode]; - uint32_t end = meta_.chunk_table()[inode + 1]; - rv = chunk_range(&meta_, begin, end); - } - return rv; + return get_chunk_range(inode - inode_offset_); } void metadata_v2::get_stat_defaults(struct ::stat* defaults) { diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 77705e1e..b5e469d2 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -213,7 +213,7 @@ class pipe_set_inode_visitor : public visitor_base { uint32_t& inode_num_; }; -class names_and_symlinks_visitor : public entry_visitor { +class names_and_symlinks_visitor : public visitor_base { public: explicit names_and_symlinks_visitor(global_entry_data& data) : data_(data) {} @@ -264,6 +264,24 @@ class save_directories_visitor : public visitor_base { std::vector directories_; }; +class save_unique_files_visitor : public visitor_base { + public: + explicit save_unique_files_visitor(uint32_t inode_begin, uint32_t inode_end) + : inode_begin_{inode_begin} { + unique_files_.resize(inode_end - inode_begin); + } + + void visit(file* p) override { + unique_files_.at(p->inode_num() - inode_begin_) = p->unique_file_id(); + } + + std::vector& get_unique_files() { return unique_files_; } + + private: + uint32_t const inode_begin_; + std::vector unique_files_; +}; + std::string status_string(progress const& p, size_t width) { auto cp = p.current.load(); std::string label, path; @@ -551,7 +569,7 @@ void scanner_::scan(filesystem_writer& fsw, worker_group blockify("blockify", 1, 1 << 20); - im.order_inodes(script_, options_.file_order, first_file_inode, + im.order_inodes(script_, options_.file_order, 0, [&](std::shared_ptr const& ino) { blockify.add_job([&] { prog.current.store(ino.get()); @@ -587,15 +605,15 @@ void scanner_::scan(filesystem_writer& fsw, // TODO: we should be able to start this once all blocks have been // submitted for compression im.for_each_inode([&](std::shared_ptr const& ino) { - DWARFS_NOTHROW(mv2.chunk_table.at(ino->num() - first_file_inode)) = - mv2.chunks.size(); + // TODO: no need for this offset stuff here... + DWARFS_NOTHROW(mv2.chunk_table.at(ino->num())) = mv2.chunks.size(); ino->append_chunks_to(mv2.chunks); }); // insert dummy inode to help determine number of chunks per inode DWARFS_NOTHROW(mv2.chunk_table.at(im.count())) = mv2.chunks.size(); - LOG_DEBUG << "total number of file inodes: " << im.count(); + LOG_DEBUG << "total number of unique files: " << im.count(); LOG_DEBUG << "total number of chunks: " << mv2.chunks.size(); LOG_INFO << "saving directories..."; @@ -606,6 +624,11 @@ void scanner_::scan(filesystem_writer& fsw, root->accept(sdv); sdv.pack(mv2, ge_data); + LOG_INFO << "saving unique files table..."; + save_unique_files_visitor sufv(first_file_inode, first_device_inode); + root->accept(sufv); + mv2.unique_files_table_ref() = std::move(sufv.get_unique_files()); + thrift::metadata::fs_options fsopts; fsopts.mtime_only = !options_.keep_all_times; if (options_.time_resolution_sec > 1) { diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index 43eaf71e..2b8700bf 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -403,7 +403,7 @@ void basic_end_to_end_test(std::string const& compressor, auto e2 = fs.find("/bar.pl"); ASSERT_TRUE(e2); - EXPECT_EQ(entry->content_index(), e2->content_index()); + EXPECT_EQ(entry->inode_num(), e2->inode_num()); struct ::stat st1, st2; ASSERT_EQ(0, fs.getattr(*entry, &st1)); @@ -417,13 +417,13 @@ void basic_end_to_end_test(std::string const& compressor, entry = fs.find("/"); ASSERT_TRUE(entry); - EXPECT_EQ(0, entry->content_index()); + EXPECT_EQ(0, entry->inode_num()); e2 = fs.find(0); ASSERT_TRUE(e2); - EXPECT_EQ(e2->content_index(), 0); + EXPECT_EQ(e2->inode_num(), 0); entry = fs.find(0, "baz.pl"); ASSERT_TRUE(entry); - EXPECT_GT(entry->content_index(), 0); + EXPECT_GT(entry->inode_num(), 0); ASSERT_EQ(0, fs.getattr(*entry, &st1)); EXPECT_EQ(23456, st1.st_size); e2 = fs.find(0, "somedir"); diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index 62c59c14..41815f07 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -74,7 +74,7 @@ struct inode_data { * - For files, (inode - chunk_index_offset) can be * used as in index into metadata.chunk_table. */ - 3: required UInt32 content_index, + 3: required UInt32 inode_v2_2, //-------------------------------------------------------------------------- // TODO: actually, the inode field is redundant as of v2.3, as entries are @@ -227,6 +227,11 @@ struct metadata { */ 19: optional list dir_entries, + /** + * Maps from file inode to chunk_table index + */ + 20: optional list unique_files_table, + // TODO: add timestamp - // 20: optional UInt64 timestamp, + // 21: optional UInt64 timestamp, }