diff --git a/CMakeLists.txt b/CMakeLists.txt index f3c11828..1a5cc14c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,6 +245,7 @@ list( src/dwarfs/progress.cpp src/dwarfs/scanner.cpp src/dwarfs/similarity.cpp + src/dwarfs/string_table.cpp src/dwarfs/terminal.cpp src/dwarfs/util.cpp src/dwarfs/version.cpp diff --git a/include/dwarfs/filesystem_v2.h b/include/dwarfs/filesystem_v2.h index 3046aa56..c474ae2a 100644 --- a/include/dwarfs/filesystem_v2.h +++ b/include/dwarfs/filesystem_v2.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -109,7 +108,7 @@ class filesystem_v2 { return impl_->opendir(entry); } - std::optional> + std::optional> readdir(directory_view dir, size_t offset) const { return impl_->readdir(dir, offset); } @@ -120,7 +119,7 @@ class filesystem_v2 { return impl_->readlink(entry, buf); } - folly::Expected readlink(inode_view entry) const { + folly::Expected readlink(inode_view entry) const { return impl_->readlink(entry); } @@ -161,11 +160,11 @@ class filesystem_v2 { virtual int access(inode_view entry, int mode, uid_t uid, gid_t gid) const = 0; virtual std::optional opendir(inode_view entry) const = 0; - virtual std::optional> + virtual std::optional> readdir(directory_view dir, size_t offset) const = 0; virtual size_t dirsize(directory_view dir) const = 0; virtual int readlink(inode_view entry, std::string* buf) const = 0; - virtual folly::Expected + virtual folly::Expected readlink(inode_view entry) const = 0; virtual int statvfs(struct ::statvfs* stbuf) const = 0; virtual int open(inode_view entry) const = 0; diff --git a/include/dwarfs/metadata_types.h b/include/dwarfs/metadata_types.h index ccdac336..6b2bd311 100644 --- a/include/dwarfs/metadata_types.h +++ b/include/dwarfs/metadata_types.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -32,6 +31,8 @@ #include +#include "dwarfs/string_table.h" + #include "dwarfs/gen-cpp2/metadata_layouts.h" namespace dwarfs { @@ -41,6 +42,27 @@ class metadata_; class dir_entry_view; +class global_metadata { + public: + using Meta = + ::apache::thrift::frozen::MappedFrozen; + + global_metadata(Meta const* meta); + + Meta const* meta() const { return meta_; } + + uint32_t first_dir_entry(uint32_t ino) const; + uint32_t parent_dir_entry(uint32_t ino) const; + + string_table const& names() const { return names_; } + + private: + Meta const* const meta_; + std::vector const directories_storage_; + thrift::metadata::directory const* const directories_; + string_table const names_; +}; + class inode_view : public ::apache::thrift::frozen::View { using InodeView = @@ -89,18 +111,15 @@ class directory_view { boost::integer_range entry_range() const; private: - directory_view(uint32_t inode, Meta const* meta, - thrift::metadata::directory const* directories = nullptr) + directory_view(uint32_t inode, global_metadata const* g) : inode_{inode} - , directories_{directories} - , meta_{meta} {} + , g_{g} {} uint32_t first_entry(uint32_t ino) const; uint32_t parent_entry(uint32_t ino) const; uint32_t inode_; - thrift::metadata::directory const* directories_; - Meta const* meta_; + global_metadata const* g_; }; class dir_entry_view { @@ -108,14 +127,12 @@ class dir_entry_view { ::apache::thrift::frozen::View; using DirEntryView = ::apache::thrift::frozen::View; - using Meta = - ::apache::thrift::frozen::MappedFrozen; template friend class metadata_; public: - std::string_view name() const; + std::string name() const; inode_view inode() const; bool is_root() const; @@ -129,33 +146,33 @@ class dir_entry_view { private: dir_entry_view(DirEntryView v, uint32_t self_index, uint32_t parent_index, - Meta const* meta) + global_metadata const* g) : v_{v} , self_index_{self_index} , parent_index_{parent_index} - , meta_{meta} {} + , g_{g} {} dir_entry_view(InodeView v, uint32_t self_index, uint32_t parent_index, - Meta const* meta) + global_metadata const* g) : v_{v} , self_index_{self_index} , parent_index_{parent_index} - , meta_{meta} {} + , g_{g} {} static dir_entry_view from_dir_entry_index(uint32_t self_index, uint32_t parent_index, - Meta const* meta); + global_metadata const* g); static dir_entry_view - from_dir_entry_index(uint32_t self_index, Meta const* meta); + from_dir_entry_index(uint32_t self_index, global_metadata const* g); // TODO: this works, but it's strange; a limited version of dir_entry_view // should work without a parent for these use cases - static std::string_view name(uint32_t index, Meta const* meta); - static inode_view inode(uint32_t index, Meta const* meta); + static std::string name(uint32_t index, global_metadata const* g); + static inode_view inode(uint32_t index, global_metadata const* g); std::variant v_; uint32_t self_index_, parent_index_; - Meta const* meta_; + global_metadata const* g_; }; using chunk_view = ::apache::thrift::frozen::View; diff --git a/include/dwarfs/metadata_v2.h b/include/dwarfs/metadata_v2.h index 0ec7ffe0..d7afee43 100644 --- a/include/dwarfs/metadata_v2.h +++ b/include/dwarfs/metadata_v2.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -104,7 +103,7 @@ class metadata_v2 { return impl_->opendir(iv); } - std::optional> + std::optional> readdir(directory_view dir, size_t offset) const { return impl_->readdir(dir, offset); } @@ -121,7 +120,7 @@ class metadata_v2 { return impl_->readlink(iv, buf); } - folly::Expected readlink(inode_view iv) const { + folly::Expected readlink(inode_view iv) const { return impl_->readlink(iv); } @@ -165,7 +164,7 @@ class metadata_v2 { virtual std::optional opendir(inode_view iv) const = 0; - virtual std::optional> + virtual std::optional> readdir(directory_view dir, size_t offset) const = 0; virtual size_t dirsize(directory_view dir) const = 0; @@ -176,8 +175,7 @@ class metadata_v2 { virtual int readlink(inode_view iv, std::string* buf) const = 0; - virtual folly::Expected - readlink(inode_view iv) const = 0; + virtual folly::Expected readlink(inode_view iv) const = 0; virtual int statvfs(struct ::statvfs* stbuf) const = 0; diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index eea4e4c4..0e9d158c 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -77,6 +77,13 @@ struct scanner_options { bool pack_chunk_table{false}; bool pack_directories{false}; bool pack_shared_files_table{false}; + bool plain_names_table{false}; + bool pack_names{false}; + bool pack_names_index{false}; + bool plain_symlinks_table{false}; + bool pack_symlinks{false}; + bool pack_symlinks_index{false}; + bool force_pack_string_tables{false}; }; struct rewrite_options { diff --git a/include/dwarfs/string_table.h b/include/dwarfs/string_table.h new file mode 100644 index 00000000..394f4552 --- /dev/null +++ b/include/dwarfs/string_table.h @@ -0,0 +1,71 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include +#include + +#include "dwarfs/gen-cpp2/metadata_layouts.h" + +namespace dwarfs { + +class string_table { + public: + using LegacyTableView = + ::apache::thrift::frozen::View>; + using PackedTableView = + ::apache::thrift::frozen::View; + + struct pack_options { + pack_options(bool pack_data = true, bool pack_index = true, + bool force_pack_data = false) + : pack_data{pack_data} + , pack_index{pack_index} + , force_pack_data{force_pack_data} {} + + bool pack_data; + bool pack_index; + bool force_pack_data; + }; + + string_table(PackedTableView v); + string_table(LegacyTableView v); + + std::string operator[](size_t index) const { return impl_->lookup(index); } + + static thrift::metadata::string_table + pack(std::vector const& input, + pack_options const& options = pack_options()); + + class impl { + public: + virtual ~impl() = default; + + virtual std::string lookup(size_t index) const = 0; + }; + + private: + std::unique_ptr impl_; +}; + +} // namespace dwarfs diff --git a/src/dwarfs/filesystem_v2.cpp b/src/dwarfs/filesystem_v2.cpp index aa9f6e2e..17c76342 100644 --- a/src/dwarfs/filesystem_v2.cpp +++ b/src/dwarfs/filesystem_v2.cpp @@ -190,12 +190,11 @@ class filesystem_ final : public filesystem_v2::impl { int getattr(inode_view entry, struct ::stat* stbuf) const override; int access(inode_view entry, int mode, uid_t uid, gid_t gid) const override; std::optional opendir(inode_view entry) const override; - std::optional> + std::optional> readdir(directory_view dir, size_t offset) const override; size_t dirsize(directory_view dir) const override; int readlink(inode_view entry, std::string* buf) const override; - folly::Expected - readlink(inode_view entry) const override; + folly::Expected readlink(inode_view entry) const override; int statvfs(struct ::statvfs* stbuf) const override; int open(inode_view entry) const override; ssize_t @@ -325,7 +324,7 @@ filesystem_::opendir(inode_view entry) const { } template -std::optional> +std::optional> filesystem_::readdir(directory_view dir, size_t offset) const { return meta_.readdir(dir, offset); } @@ -342,7 +341,7 @@ int filesystem_::readlink(inode_view entry, } template -folly::Expected +folly::Expected filesystem_::readlink(inode_view entry) const { return meta_.readlink(entry); } diff --git a/src/dwarfs/metadata_types.cpp b/src/dwarfs/metadata_types.cpp index 4e666017..e5f07f26 100644 --- a/src/dwarfs/metadata_types.cpp +++ b/src/dwarfs/metadata_types.cpp @@ -19,14 +19,84 @@ * along with dwarfs. If not, see . */ -#include "dwarfs/metadata_types.h" +#include + #include "dwarfs/error.h" +#include "dwarfs/metadata_types.h" #include "dwarfs/overloaded.h" #include "dwarfs/gen-cpp2/metadata_types_custom_protocol.h" namespace dwarfs { +namespace { + +std::vector +unpack_directories(global_metadata::Meta const* meta) { + std::vector directories; + + if (auto opts = meta->options(); opts and opts->packed_directories()) { + auto dirent = *meta->dir_entries(); + auto metadir = meta->directories(); + + { + directories.resize(metadir.size()); + + // delta-decode first entries first + directories[0].first_entry = metadir[0].first_entry(); + + for (size_t i = 1; i < directories.size(); ++i) { + directories[i].first_entry = + directories[i - 1].first_entry + metadir[i].first_entry(); + } + + // then traverse to recover parent entries + std::queue queue; + queue.push(0); + + while (!queue.empty()) { + auto parent = queue.front(); + queue.pop(); + + auto p_ino = dirent[parent].inode_num(); + + auto beg = directories[p_ino].first_entry; + auto end = directories[p_ino + 1].first_entry; + + for (auto e = beg; e < end; ++e) { + if (auto e_ino = dirent[e].inode_num(); + e_ino < (directories.size() - 1)) { + directories[e_ino].parent_entry = parent; + queue.push(e); + } + } + } + } + } + + return directories; +} + +} // namespace + +global_metadata::global_metadata(Meta const* meta) + : meta_{meta} + , directories_storage_{unpack_directories(meta_)} + , directories_{directories_storage_.empty() ? nullptr + : directories_storage_.data()} + , names_{meta_->compact_names() ? string_table(*meta_->compact_names()) + : string_table(meta_->names())} {} + +uint32_t global_metadata::first_dir_entry(uint32_t ino) const { + return directories_ ? directories_[ino].first_entry + : meta_->directories()[ino].first_entry(); +} + +uint32_t global_metadata::parent_dir_entry(uint32_t ino) const { + return directories_ ? directories_[ino].parent_entry + : meta_->directories()[ino].parent_entry(); +} + uint16_t inode_view::mode() const { return meta_->modes()[mode_index()]; } uint16_t inode_view::getuid() const { return meta_->uids()[owner_index()]; } @@ -35,13 +105,14 @@ uint16_t inode_view::getgid() const { return meta_->gids()[group_index()]; } // TODO: pretty certain some of this stuff can be simplified -std::string_view dir_entry_view::name() const { +std::string dir_entry_view::name() const { return std::visit(overloaded{ [this](DirEntryView const& dev) { - return meta_->names()[dev.name_index()]; + return g_->names()[dev.name_index()]; }, [this](InodeView const& iv) { - return meta_->names()[iv.name_index_v2_2()]; + return std::string( + g_->meta()->names()[iv.name_index_v2_2()]); }, }, v_); @@ -50,11 +121,12 @@ std::string_view dir_entry_view::name() const { inode_view dir_entry_view::inode() const { return std::visit(overloaded{ [this](DirEntryView const& dev) { - return inode_view(meta_->inodes()[dev.inode_num()], - dev.inode_num(), meta_); + return inode_view( + g_->meta()->inodes()[dev.inode_num()], + dev.inode_num(), g_->meta()); }, [this](InodeView const& iv) { - return inode_view(iv, iv.inode_v2_2(), meta_); + return inode_view(iv, iv.inode_v2_2(), g_->meta()); }, }, v_); @@ -77,14 +149,16 @@ bool dir_entry_view::is_root() const { dir_entry_view dir_entry_view::from_dir_entry_index(uint32_t self_index, uint32_t parent_index, - Meta const* meta) { + global_metadata const* g) { + auto meta = g->meta(); + if (auto de = meta->dir_entries()) { DWARFS_CHECK(self_index < de->size(), "self_index out of range"); DWARFS_CHECK(parent_index < de->size(), "parent_index out of range"); auto dev = (*de)[self_index]; - return dir_entry_view(dev, self_index, parent_index, meta); + return dir_entry_view(dev, self_index, parent_index, g); } DWARFS_CHECK(self_index < meta->inodes().size(), "self_index out of range"); @@ -92,19 +166,20 @@ dir_entry_view::from_dir_entry_index(uint32_t self_index, uint32_t parent_index, auto iv = meta->inodes()[self_index]; - return dir_entry_view(iv, self_index, parent_index, meta); + return dir_entry_view(iv, self_index, parent_index, g); } -dir_entry_view -dir_entry_view::from_dir_entry_index(uint32_t self_index, Meta const* meta) { +dir_entry_view dir_entry_view::from_dir_entry_index(uint32_t self_index, + global_metadata const* g) { + auto meta = g->meta(); + if (auto de = meta->dir_entries()) { DWARFS_CHECK(self_index < de->size(), "self_index out of range"); auto dev = (*de)[self_index]; DWARFS_CHECK(dev.inode_num() < meta->directories().size(), "self_index inode out of range"); - return dir_entry_view(dev, self_index, - meta->directories()[dev.inode_num()].parent_entry(), - meta); + return dir_entry_view(dev, self_index, g->parent_dir_entry(dev.inode_num()), + g); } DWARFS_CHECK(self_index < meta->inodes().size(), "self_index out of range"); @@ -116,7 +191,7 @@ dir_entry_view::from_dir_entry_index(uint32_t self_index, Meta const* meta) { iv, self_index, meta->entry_table_v2_2()[meta->directories()[iv.inode_v2_2()] .parent_entry()], - meta); + g); } std::optional dir_entry_view::parent() const { @@ -124,31 +199,32 @@ std::optional dir_entry_view::parent() const { return std::nullopt; } - return from_dir_entry_index(parent_index_, meta_); + return from_dir_entry_index(parent_index_, g_); } -std::string_view dir_entry_view::name(uint32_t index, Meta const* meta) { - if (auto de = meta->dir_entries()) { +std::string dir_entry_view::name(uint32_t index, global_metadata const* g) { + if (auto de = g->meta()->dir_entries()) { DWARFS_CHECK(index < de->size(), "index out of range"); auto dev = (*de)[index]; - return meta->names()[dev.name_index()]; + return g->names()[dev.name_index()]; } - DWARFS_CHECK(index < meta->inodes().size(), "index out of range"); - auto iv = meta->inodes()[index]; - return meta->names()[iv.name_index_v2_2()]; + DWARFS_CHECK(index < g->meta()->inodes().size(), "index out of range"); + auto iv = g->meta()->inodes()[index]; + return std::string(g->meta()->names()[iv.name_index_v2_2()]); } -inode_view dir_entry_view::inode(uint32_t index, Meta const* meta) { - if (auto de = meta->dir_entries()) { +inode_view dir_entry_view::inode(uint32_t index, global_metadata const* g) { + if (auto de = g->meta()->dir_entries()) { DWARFS_CHECK(index < de->size(), "index out of range"); auto dev = (*de)[index]; - return inode_view(meta->inodes()[dev.inode_num()], dev.inode_num(), meta); + return inode_view(g->meta()->inodes()[dev.inode_num()], dev.inode_num(), + g->meta()); } - DWARFS_CHECK(index < meta->inodes().size(), "index out of range"); - auto iv = meta->inodes()[index]; - return inode_view(iv, iv.inode_v2_2(), meta); + DWARFS_CHECK(index < g->meta()->inodes().size(), "index out of range"); + auto iv = g->meta()->inodes()[index]; + return inode_view(iv, iv.inode_v2_2(), g->meta()); } std::string dir_entry_view::path() const { @@ -170,13 +246,11 @@ void dir_entry_view::append_path_to(std::string& s) const { } uint32_t directory_view::first_entry(uint32_t ino) const { - return directories_ ? directories_[ino].first_entry - : meta_->directories()[ino].first_entry(); + return g_->first_dir_entry(ino); } uint32_t directory_view::parent_entry(uint32_t ino) const { - return directories_ ? directories_[ino].parent_entry - : meta_->directories()[ino].parent_entry(); + return g_->parent_dir_entry(ino); } uint32_t directory_view::entry_count() const { @@ -194,7 +268,7 @@ uint32_t directory_view::parent_inode() const { auto ent = parent_entry(inode_); - if (auto e = meta_->dir_entries()) { + if (auto e = g_->meta()->dir_entries()) { ent = (*e)[ent].inode_num(); } diff --git a/src/dwarfs/metadata_v2.cpp b/src/dwarfs/metadata_v2.cpp index b314e406..54ddf092 100644 --- a/src/dwarfs/metadata_v2.cpp +++ b/src/dwarfs/metadata_v2.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -45,10 +44,13 @@ #include +#include + #include "dwarfs/error.h" #include "dwarfs/logger.h" #include "dwarfs/metadata_v2.h" #include "dwarfs/options.h" +#include "dwarfs/string_table.h" #include "dwarfs/util.h" #include "dwarfs/gen-cpp2/metadata_layouts.h" @@ -104,8 +106,6 @@ void analyze_frozen(std::ostream& os, auto layout = meta.findFirstOfType< std::unique_ptr>>(); - os << "metadata memory usage:\n"; - auto& l = *layout; std::vector> usage; @@ -139,13 +139,33 @@ void analyze_frozen(std::ostream& os, auto add_string_list_size = [&](auto const& name, auto const& list, auto const& field) { auto count = list.size(); - auto index_size = list_size(list, field); - auto data_size = list.back().end() - list.front().begin(); - auto size = index_size + data_size; - auto fmt = fmt_size(name, count, size) + - fmt_detail("|- index", count, index_size) + - fmt_detail("'- data", count, data_size); - usage.emplace_back(size, fmt); + if (count > 0) { + auto index_size = list_size(list, field); + auto data_size = list.back().end() - list.front().begin(); + auto size = index_size + data_size; + auto fmt = fmt_size(name, count, size) + + fmt_detail("|- data", count, data_size) + + fmt_detail("'- index", count, index_size); + usage.emplace_back(size, fmt); + } + }; + + auto add_string_table_size = [&](auto const& name, auto const& table, + auto const& field) { + if (auto data_size = table.buffer().size(); data_size > 0) { + auto dict_size = + table.symtab() ? table.symtab()->size() : static_cast(0); + auto index_size = list_size(table.index(), field.layout.indexField); + auto size = index_size + data_size + dict_size; + auto count = table.index().size() - (table.packed_index() ? 0 : 1); + auto fmt = + fmt_size(name, count, size) + fmt_detail("|- data", count, data_size); + if (table.symtab()) { + fmt += fmt_detail("|- dict", count, dict_size); + } + fmt += fmt_detail("'- index", count, index_size); + usage.emplace_back(size, fmt); + } }; #define META_LIST_SIZE(x) add_list_size(#x, meta.x(), l->x##Field) @@ -159,6 +179,13 @@ void analyze_frozen(std::ostream& os, } \ } while (0) +#define META_OPT_STRING_TABLE_SIZE(x) \ + do { \ + if (auto table = meta.x()) { \ + add_string_table_size(#x, *table, l->x##Field.layout.valueField); \ + } \ + } while (0) + META_LIST_SIZE(chunks); META_LIST_SIZE(directories); META_LIST_SIZE(inodes); @@ -172,17 +199,27 @@ void analyze_frozen(std::ostream& os, META_OPT_LIST_SIZE(dir_entries); META_OPT_LIST_SIZE(shared_files_table); + META_OPT_STRING_TABLE_SIZE(compact_names); + META_OPT_STRING_TABLE_SIZE(compact_symlinks); + META_STRING_LIST_SIZE(names); META_STRING_LIST_SIZE(symlinks); #undef META_LIST_SIZE #undef META_STRING_LIST_SIZE #undef META_OPT_LIST_SIZE +#undef META_OPT_STRING_TABLE_SIZE std::sort(usage.begin(), usage.end(), [](auto const& a, auto const& b) { return a.first > b.first || (a.first == b.first && a.second < b.second); }); + os << "metadata memory usage:\n"; + os << fmt::format( + " {0:.<20}{1:.>13L} bytes {2:6.1f} bytes/inode\n", + "total metadata", total_size, + static_cast(total_size) / meta.inodes().size()); + for (auto const& u : usage) { os << u.second; } @@ -204,7 +241,8 @@ class metadata_ final : public metadata_v2::impl { metadata_options const& options, int inode_offset) : data_(data) , meta_(map_frozen(schema, data_)) - , root_(dir_entry_view::from_dir_entry_index(0, &meta_)) + , global_(&meta_) + , root_(dir_entry_view::from_dir_entry_index(0, &global_)) , log_(lgr) , inode_offset_(inode_offset) , symlink_inode_offset_(find_inode_offset(inode_rank::INO_LNK)) @@ -214,9 +252,6 @@ class metadata_ final : public metadata_v2::impl { : meta_.entry_table_v2_2().size()) , nlinks_(build_nlinks(options)) , chunk_table_(unpack_chunk_table()) - , directories_storage_(unpack_directories()) - , directories_(directories_storage_.empty() ? nullptr - : directories_storage_.data()) , shared_files_(decompress_shared_files()) , unique_files_(dev_inode_offset_ - file_inode_offset_ - (shared_files_.empty() @@ -224,7 +259,10 @@ class metadata_ final : public metadata_v2::impl { ? meta_.shared_files_table()->size() : 0 : shared_files_.size())) - , options_(options) { + , options_(options) + , symlinks_(meta_.compact_symlinks() + ? string_table(*meta_.compact_symlinks()) + : string_table(meta_.symlinks())) { if (static_cast(meta_.directories().size() - 1) != symlink_inode_offset_) { DWARFS_THROW( @@ -304,7 +342,7 @@ class metadata_ final : public metadata_v2::impl { std::optional opendir(inode_view iv) const override; - std::optional> + std::optional> readdir(directory_view dir, size_t offset) const override; size_t dirsize(directory_view dir) const override { @@ -317,7 +355,7 @@ class metadata_ final : public metadata_v2::impl { int readlink(inode_view iv, std::string* buf) const override; - folly::Expected readlink(inode_view iv) const override; + folly::Expected readlink(inode_view iv) const override; int statvfs(struct ::statvfs* stbuf) const override; @@ -339,7 +377,7 @@ class metadata_ final : public metadata_v2::impl { dir_entry_view make_dir_entry_view(uint32_t self_index, uint32_t parent_index) const { return dir_entry_view::from_dir_entry_index(self_index, parent_index, - &meta_); + &global_); } // This represents the order in which inodes are stored in inodes @@ -420,7 +458,7 @@ class metadata_ final : public metadata_v2::impl { directory_view make_directory_view(inode_view iv) const { // TODO: revisit: is this the way to do it? - return directory_view(iv.inode_num(), &meta_, directories_); + return directory_view(iv.inode_num(), &global_); } // TODO: see if we really need to pass the extra dir_entry_view in @@ -525,9 +563,9 @@ class metadata_ final : public metadata_v2::impl { return rv; } - std::string_view link_value(inode_view iv) const { - return meta_.symlinks()[meta_.symlink_table()[iv.inode_num() - - symlink_inode_offset_]]; + std::string link_value(inode_view iv) const { + return symlinks_[meta_.symlink_table()[iv.inode_num() - + symlink_inode_offset_]]; } uint64_t get_device_id(int inode) const { @@ -550,58 +588,6 @@ class metadata_ final : public metadata_v2::impl { return chunk_table; } - std::vector unpack_directories() const { - std::vector directories; - - if (auto opts = meta_.options(); opts and opts->packed_directories()) { - auto dirent = *meta_.dir_entries(); - auto metadir = meta_.directories(); - - { - auto ti = LOG_TIMED_DEBUG; - - directories.resize(metadir.size()); - - // delta-decode first entries first - directories[0].first_entry = metadir[0].first_entry(); - - for (size_t i = 1; i < directories.size(); ++i) { - directories[i].first_entry = - directories[i - 1].first_entry + metadir[i].first_entry(); - } - - // then traverse to recover parent entries - std::queue queue; - queue.push(0); - - while (!queue.empty()) { - auto parent = queue.front(); - queue.pop(); - - auto p_ino = dirent[parent].inode_num(); - - auto beg = directories[p_ino].first_entry; - auto end = directories[p_ino + 1].first_entry; - - for (auto e = beg; e < end; ++e) { - if (auto e_ino = dirent[e].inode_num(); - e_ino < (directories.size() - 1)) { - directories[e_ino].parent_entry = parent; - queue.push(e); - } - } - } - - ti << "unpacked directories table (" - << size_with_unit(sizeof(directories.front()) * - directories.capacity()) - << ")"; - } - } - - return directories; - } - std::vector decompress_shared_files() const { std::vector decompressed; @@ -664,6 +650,7 @@ class metadata_ final : public metadata_v2::impl { folly::ByteRange data_; MappedFrozen meta_; + const global_metadata global_; dir_entry_view root_; log_proxy log_; const int inode_offset_; @@ -673,11 +660,10 @@ class metadata_ final : public metadata_v2::impl { const int inode_count_; const std::vector nlinks_; const std::vector chunk_table_; - const std::vector directories_storage_; - thrift::metadata::directory const* const directories_; const std::vector shared_files_; const int unique_files_; const metadata_options options_; + const string_table symlinks_; }; template @@ -770,6 +756,14 @@ void metadata_::dump( boolopt("packed_chunk_table", opt->packed_chunk_table()); boolopt("packed_directories", opt->packed_directories()); boolopt("packed_shared_files_table", opt->packed_shared_files_table()); + if (auto names = meta_.compact_names()) { + boolopt("packed_names", static_cast(names->symtab())); + boolopt("packed_names_index", names->packed_index()); + } + if (auto symlinks = meta_.compact_symlinks()) { + boolopt("packed_symlinks", static_cast(symlinks->symtab())); + boolopt("packed_symlinks_index", symlinks->packed_index()); + } os << "options: " << boost::join(options, "\n ") << std::endl; if (auto res = opt->time_resolution_sec()) { os << "time resolution: " << *res << " seconds" << std::endl; @@ -1041,14 +1035,14 @@ metadata_::find(directory_view dir, std::string_view name) const { auto it = std::lower_bound(range.begin(), range.end(), name, [&](auto ix, std::string_view name) { - return dir_entry_view::name(ix, &meta_) < name; + return dir_entry_view::name(ix, &global_) < name; }); std::optional rv; if (it != range.end()) { - if (dir_entry_view::name(*it, &meta_) == name) { - rv = dir_entry_view::inode(*it, &meta_); + if (dir_entry_view::name(*it, &global_) == name) { + rv = dir_entry_view::inode(*it, &global_); } } @@ -1157,7 +1151,7 @@ metadata_::opendir(inode_view iv) const { } template -std::optional> +std::optional> metadata_::readdir(directory_view dir, size_t offset) const { switch (offset) { case 0: @@ -1174,8 +1168,8 @@ metadata_::readdir(directory_view dir, size_t offset) const { } auto index = dir.first_entry() + offset; - auto inode = dir_entry_view::inode(index, &meta_); - return std::pair(inode, dir_entry_view::name(index, &meta_)); + auto inode = dir_entry_view::inode(index, &global_); + return std::pair(inode, dir_entry_view::name(index, &global_)); } return std::nullopt; @@ -1235,7 +1229,7 @@ int metadata_::readlink(inode_view iv, std::string* buf) const { } template -folly::Expected +folly::Expected metadata_::readlink(inode_view iv) const { if (S_ISLNK(iv.mode())) { return link_value(iv); diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 542a903c..d64d5240 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -55,6 +55,7 @@ #include "dwarfs/progress.h" #include "dwarfs/scanner.h" #include "dwarfs/script.h" +#include "dwarfs/string_table.h" #include "dwarfs/util.h" #include "dwarfs/version.h" #include "dwarfs/worker_group.h" @@ -740,11 +741,32 @@ void scanner_::scan(filesystem_writer& fsw, fsopts.packed_directories = options_.pack_directories; fsopts.packed_shared_files_table = options_.pack_shared_files_table; + if (options_.plain_names_table) { + mv2.names = ge_data.get_names(); + } else { + auto ti = LOG_TIMED_INFO; + mv2.set_compact_names(string_table::pack( + ge_data.get_names(), string_table::pack_options( + options_.pack_names, options_.pack_names_index, + options_.force_pack_string_tables))); + ti << "saving names table..."; + } + + if (options_.plain_symlinks_table) { + mv2.symlinks = ge_data.get_symlinks(); + } else { + auto ti = LOG_TIMED_INFO; + mv2.set_compact_symlinks(string_table::pack( + ge_data.get_symlinks(), + string_table::pack_options(options_.pack_symlinks, + options_.pack_symlinks_index, + options_.force_pack_string_tables))); + ti << "saving symlinks table..."; + } + mv2.uids = ge_data.get_uids(); mv2.gids = ge_data.get_gids(); mv2.modes = ge_data.get_modes(); - mv2.names = ge_data.get_names(); - mv2.symlinks = ge_data.get_symlinks(); mv2.timestamp_base = ge_data.get_timestamp_base(); mv2.block_size = UINT32_C(1) << cfg_.block_size_bits; mv2.total_fs_size = prog.original_size; diff --git a/src/dwarfs/string_table.cpp b/src/dwarfs/string_table.cpp new file mode 100644 index 00000000..02bcb6b9 --- /dev/null +++ b/src/dwarfs/string_table.cpp @@ -0,0 +1,248 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include + +#include + +#include + +#include "dwarfs/error.h" +#include "dwarfs/string_table.h" + +namespace dwarfs { + +class legacy_string_table : public string_table::impl { + public: + legacy_string_table(string_table::LegacyTableView v) + : v_{v} {} + + std::string lookup(size_t index) const override { + return std::string(v_[index]); + } + + private: + string_table::LegacyTableView v_; +}; + +template +class packed_string_table : public string_table::impl { + public: + packed_string_table(string_table::PackedTableView v) + : v_{v} + , buffer_{v_.buffer().data()} { + if constexpr (PackedData) { + auto st = v_.symtab(); + DWARFS_CHECK(st, "symtab unexpectedly unset"); + dec_ = std::make_unique(); + + auto read = fsst_import(dec_.get(), reinterpret_cast( + const_cast(st->data()))); + + if (read != st->size()) { + DWARFS_THROW(runtime_error, + fmt::format("read {0} symtab bytes, expected {1}", read, + st->size())); + } + } + + if constexpr (PackedIndex) { + DWARFS_CHECK(v_.packed_index(), "index unexpectedly not packed"); + index_.resize(v_.index().size() + 1); + std::partial_sum(v_.index().begin(), v_.index().end(), + index_.begin() + 1); + } + } + + std::string lookup(size_t index) const override { + auto beg = buffer_; + auto end = buffer_; + + if constexpr (PackedIndex) { + beg += index_[index]; + end += index_[index + 1]; + } else { + beg += v_.index()[index]; + end += v_.index()[index + 1]; + } + + if constexpr (PackedData) { + thread_local std::string out; + size_t size = end - beg; + out.resize(8 * size); + auto outlen = fsst_decompress( + dec_.get(), size, + reinterpret_cast(const_cast(beg)), out.size(), + reinterpret_cast(out.data())); + out.resize(outlen); + return out; + } + + return std::string(beg, end); + } + + private: + string_table::PackedTableView v_; + char const* const buffer_; + std::vector index_; + std::unique_ptr dec_; +}; + +string_table::string_table(LegacyTableView v) + : impl_{std::make_unique(v)} {} + +namespace { + +std::unique_ptr +build_string_table(string_table::PackedTableView v) { + if (v.symtab()) { + if (v.packed_index()) { + return std::make_unique>(v); + } else { + return std::make_unique>(v); + } + } else { + if (v.packed_index()) { + return std::make_unique>(v); + } else { + return std::make_unique>(v); + } + } +} + +} // namespace + +string_table::string_table(PackedTableView v) + : impl_{build_string_table(v)} {} + +thrift::metadata::string_table +string_table::pack(std::vector const& input, + pack_options const& options) { + auto size = input.size(); + bool pack_data = options.pack_data; + size_t total_input_size = 0; + std::string buffer; + std::string symtab; + std::vector out_len_vec; + std::vector out_ptr_vec; + + if (input.empty()) { + pack_data = false; + } + + if (pack_data) { + std::vector len_vec; + std::vector ptr_vec; + + len_vec.reserve(size); + ptr_vec.reserve(size); + + for (auto const& s : input) { + ptr_vec.emplace_back( + reinterpret_cast(const_cast(s.data()))); + len_vec.emplace_back(s.size()); + total_input_size += s.size(); + } + + std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{ + ::fsst_create(size, len_vec.data(), ptr_vec.data(), 0), + &::fsst_destroy}; + + symtab.resize(sizeof(::fsst_decoder_t)); + + auto symtab_size = ::fsst_export( + enc.get(), reinterpret_cast(symtab.data())); + symtab.resize(symtab_size); + + if (symtab.size() < total_input_size or options.force_pack_data) { + out_len_vec.resize(size); + out_ptr_vec.resize(size); + + buffer.resize(options.force_pack_data ? total_input_size + : total_input_size - symtab.size()); + size_t num_compressed = 0; + + do { + num_compressed = ::fsst_compress( + enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(), + reinterpret_cast(buffer.data()), out_len_vec.data(), + out_ptr_vec.data()); + + if (num_compressed == size) { + break; + } + + buffer.resize(2 * buffer.size()); + } while (options.force_pack_data); + + pack_data = num_compressed == size; + } else { + pack_data = false; + } + } else { + for (auto const& s : input) { + total_input_size += s.size(); + } + } + + thrift::metadata::string_table output; + + if (pack_data) { + // store compressed + size_t compressed_size = + (out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back(); + + DWARFS_CHECK(reinterpret_cast(out_ptr_vec.front()) == buffer.data(), + "string table compression pointer mismatch"); + // TODO: only enable this in debug mode + DWARFS_CHECK(compressed_size == std::accumulate(out_len_vec.begin(), + out_len_vec.end(), + static_cast(0)), + "string table compression pointer mismatch"); + + buffer.resize(compressed_size); + output.buffer.swap(buffer); + output.set_symtab(std::move(symtab)); + output.index.resize(size); + std::copy(out_len_vec.begin(), out_len_vec.end(), output.index.begin()); + } else { + // store uncompressed + output.buffer.reserve(total_input_size); + output.index.reserve(size); + for (auto const& s : input) { + output.buffer += s; + output.index.emplace_back(s.size()); + } + } + + output.packed_index = options.pack_index; + + if (!options.pack_index) { + output.index.insert(output.index.begin(), 0); + std::partial_sum(output.index.begin(), output.index.end(), + output.index.begin()); + } + + return output; +} + +} // namespace dwarfs diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index d8f3dc7f..10fdc2de 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -397,7 +397,9 @@ int mkdwarfs(int argc, char** argv) { "metadata compression algorithm") ("pack-metadata", po::value(&pack_metadata)->default_value("all"), - "pack certain metadata elements (none, chunk_table, directories, shared_files, all)") + "pack certain metadata elements (none, all, chunk_table, " + "directories, shared_files, names, names_index, symlinks, " + "symlinks_index)") ("recompress", po::value(&recompress_opts)->implicit_value("all"), "recompress an existing filesystem (none, block, metadata, all)") @@ -733,6 +735,10 @@ int mkdwarfs(int argc, char** argv) { options.pack_chunk_table = true; options.pack_directories = true; options.pack_shared_files_table = true; + options.pack_names = true; + options.pack_names_index = true; + options.pack_symlinks = true; + options.pack_symlinks_index = true; } else { std::vector pack_opts; boost::split(pack_opts, pack_metadata, boost::is_any_of(",")); @@ -743,6 +749,14 @@ int mkdwarfs(int argc, char** argv) { options.pack_directories = true; } else if (opt == "shared_files") { options.pack_shared_files_table = true; + } else if (opt == "names") { + options.pack_names = true; + } else if (opt == "names_index") { + options.pack_names_index = true; + } else if (opt == "symlinks") { + options.pack_symlinks = true; + } else if (opt == "symlinks_index") { + options.pack_symlinks_index = true; } else { std::cerr << "error: the argument ('" << opt << "') to '--pack-metadata' is invalid" << std::endl; diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index f4bb5b9d..2a40c404 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -195,8 +195,10 @@ void basic_end_to_end_test(std::string const& compressor, bool with_devices, bool with_specials, bool set_uid, bool set_gid, bool set_time, bool keep_all_times, bool enable_nlink, bool pack_chunk_table, - bool pack_directories, - bool pack_shared_files_table) { + bool pack_directories, bool pack_shared_files_table, + bool pack_names, bool pack_names_index, + bool pack_symlinks, bool pack_symlinks_index, + bool plain_names_table, bool plain_symlinks_table) { block_manager::config cfg; scanner_options options; @@ -212,6 +214,13 @@ void basic_end_to_end_test(std::string const& compressor, options.pack_chunk_table = pack_chunk_table; options.pack_directories = pack_directories; options.pack_shared_files_table = pack_shared_files_table; + options.pack_names = pack_names; + options.pack_names_index = pack_names_index; + options.pack_symlinks = pack_symlinks; + options.pack_symlinks_index = pack_symlinks_index; + options.force_pack_string_tables = true; + options.plain_names_table = plain_names_table; + options.plain_symlinks_table = plain_symlinks_table; if (set_uid) { options.uid = 0; @@ -518,11 +527,17 @@ class compression_test : public testing::TestWithParam< std::tuple> {}; -class scanner_test - : public testing::TestWithParam> { +class scanner_test : public testing::TestWithParam< + std::tuple> { }; +class packing_test : public testing::TestWithParam< + std::tuple> { +}; + +class plain_tables_test + : public testing::TestWithParam> {}; + TEST_P(compression_test, end_to_end) { auto [compressor, block_size_bits, file_order] = GetParam(); @@ -532,18 +547,38 @@ TEST_P(compression_test, end_to_end) { } basic_end_to_end_test(compressor, block_size_bits, file_order, true, true, - false, false, false, false, false, true, true, true); + false, false, false, false, false, true, true, true, + true, true, true, true, false, false); } TEST_P(scanner_test, end_to_end) { auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times, - enable_nlink, pack_chunk_table, pack_directories, - pack_shared_files_table] = GetParam(); + enable_nlink] = GetParam(); basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, with_devices, with_specials, set_uid, set_gid, set_time, - keep_all_times, enable_nlink, pack_chunk_table, - pack_directories, pack_shared_files_table); + keep_all_times, enable_nlink, true, true, true, true, + true, true, true, false, false); +} + +TEST_P(packing_test, end_to_end) { + auto [pack_chunk_table, pack_directories, pack_shared_files_table, pack_names, + pack_names_index, pack_symlinks, pack_symlinks_index] = GetParam(); + + basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true, + false, false, false, false, false, pack_chunk_table, + pack_directories, pack_shared_files_table, pack_names, + pack_names_index, pack_symlinks, pack_symlinks_index, + false, false); +} + +TEST_P(plain_tables_test, end_to_end) { + auto [plain_names_table, plain_symlinks_table] = GetParam(); + + basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true, + false, false, false, false, false, false, false, false, + false, false, false, false, plain_names_table, + plain_symlinks_table); } INSTANTIATE_TEST_SUITE_P( @@ -558,5 +593,14 @@ INSTANTIATE_TEST_SUITE_P( dwarfs, scanner_test, ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); + +INSTANTIATE_TEST_SUITE_P( + dwarfs, packing_test, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool())); + +INSTANTIATE_TEST_SUITE_P(dwarfs, plain_tables_test, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index b83175de..bba08040 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -142,6 +142,13 @@ struct fs_options { 5: required bool packed_shared_files_table, } +struct string_table { + 1: required string buffer, + 2: optional string symtab, + 3: required list index, + 4: required bool packed_index, +} + /** * File System Metadata * @@ -320,4 +327,8 @@ struct metadata { // unix timestamp of metadata creation time 23: optional UInt64 create_timestamp, + + 24: optional string_table compact_names, + + 25: optional string_table compact_symlinks, }