New compact string table representation

This commit is contained in:
Marcus Holland-Moritz 2021-03-20 21:00:35 +01:00
parent 955103fdab
commit df2c653d8a
14 changed files with 664 additions and 165 deletions

View File

@ -245,6 +245,7 @@ list(
src/dwarfs/progress.cpp
src/dwarfs/scanner.cpp
src/dwarfs/similarity.cpp
src/dwarfs/string_table.cpp
src/dwarfs/terminal.cpp
src/dwarfs/util.cpp
src/dwarfs/version.cpp

View File

@ -29,7 +29,6 @@
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <sys/types.h>
@ -109,7 +108,7 @@ class filesystem_v2 {
return impl_->opendir(entry);
}
std::optional<std::pair<inode_view, std::string_view>>
std::optional<std::pair<inode_view, std::string>>
readdir(directory_view dir, size_t offset) const {
return impl_->readdir(dir, offset);
}
@ -120,7 +119,7 @@ class filesystem_v2 {
return impl_->readlink(entry, buf);
}
folly::Expected<std::string_view, int> readlink(inode_view entry) const {
folly::Expected<std::string, int> readlink(inode_view entry) const {
return impl_->readlink(entry);
}
@ -161,11 +160,11 @@ class filesystem_v2 {
virtual int
access(inode_view entry, int mode, uid_t uid, gid_t gid) const = 0;
virtual std::optional<directory_view> opendir(inode_view entry) const = 0;
virtual std::optional<std::pair<inode_view, std::string_view>>
virtual std::optional<std::pair<inode_view, std::string>>
readdir(directory_view dir, size_t offset) const = 0;
virtual size_t dirsize(directory_view dir) const = 0;
virtual int readlink(inode_view entry, std::string* buf) const = 0;
virtual folly::Expected<std::string_view, int>
virtual folly::Expected<std::string, int>
readlink(inode_view entry) const = 0;
virtual int statvfs(struct ::statvfs* stbuf) const = 0;
virtual int open(inode_view entry) const = 0;

View File

@ -24,7 +24,6 @@
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string_view>
#include <variant>
#include <boost/iterator/iterator_facade.hpp>
@ -32,6 +31,8 @@
#include <thrift/lib/cpp2/frozen/FrozenUtil.h>
#include "dwarfs/string_table.h"
#include "dwarfs/gen-cpp2/metadata_layouts.h"
namespace dwarfs {
@ -41,6 +42,27 @@ class metadata_;
class dir_entry_view;
/**
 * Shared, read-only lookup state derived from the frozen metadata.
 *
 * Bundles the mapped metadata with the tables that are built from it at
 * construction time (the unpacked directories table and the names string
 * table) so that views only need to carry a single pointer.
 *
 * The object does not own the mapped metadata; it only stores the pointer
 * passed to the constructor.
 */
class global_metadata {
public:
using Meta =
::apache::thrift::frozen::MappedFrozen<thrift::metadata::metadata>;
// Builds the derived tables from `meta`; `meta` is dereferenced during
// construction and by the accessors below.
global_metadata(Meta const* meta);
// The underlying frozen metadata this object was constructed from.
Meta const* meta() const { return meta_; }
// Index of the first directory entry of directory inode `ino`. Uses the
// unpacked table when one was built, the frozen table otherwise.
uint32_t first_dir_entry(uint32_t ino) const;
// Index of the directory entry of the parent of directory inode `ino`,
// resolved the same way as first_dir_entry().
uint32_t parent_dir_entry(uint32_t ino) const;
// String table used to resolve directory entry names.
string_table const& names() const { return names_; }
private:
// NOTE: the constructor initializes each member from the ones declared
// before it, so the declaration order below must not be changed.
Meta const* const meta_;
// Storage for the unpacked directories table; empty if nothing was unpacked.
std::vector<thrift::metadata::directory> const directories_storage_;
// Points into directories_storage_, or is nullptr when that vector is empty.
thrift::metadata::directory const* const directories_;
// Built from compact_names() when present, from the plain names list
// otherwise.
string_table const names_;
};
class inode_view
: public ::apache::thrift::frozen::View<thrift::metadata::inode_data> {
using InodeView =
@ -89,18 +111,15 @@ class directory_view {
boost::integer_range<uint32_t> entry_range() const;
private:
directory_view(uint32_t inode, Meta const* meta,
thrift::metadata::directory const* directories = nullptr)
directory_view(uint32_t inode, global_metadata const* g)
: inode_{inode}
, directories_{directories}
, meta_{meta} {}
, g_{g} {}
uint32_t first_entry(uint32_t ino) const;
uint32_t parent_entry(uint32_t ino) const;
uint32_t inode_;
thrift::metadata::directory const* directories_;
Meta const* meta_;
global_metadata const* g_;
};
class dir_entry_view {
@ -108,14 +127,12 @@ class dir_entry_view {
::apache::thrift::frozen::View<thrift::metadata::inode_data>;
using DirEntryView =
::apache::thrift::frozen::View<thrift::metadata::dir_entry>;
using Meta =
::apache::thrift::frozen::MappedFrozen<thrift::metadata::metadata>;
template <typename T>
friend class metadata_;
public:
std::string_view name() const;
std::string name() const;
inode_view inode() const;
bool is_root() const;
@ -129,33 +146,33 @@ class dir_entry_view {
private:
dir_entry_view(DirEntryView v, uint32_t self_index, uint32_t parent_index,
Meta const* meta)
global_metadata const* g)
: v_{v}
, self_index_{self_index}
, parent_index_{parent_index}
, meta_{meta} {}
, g_{g} {}
dir_entry_view(InodeView v, uint32_t self_index, uint32_t parent_index,
Meta const* meta)
global_metadata const* g)
: v_{v}
, self_index_{self_index}
, parent_index_{parent_index}
, meta_{meta} {}
, g_{g} {}
static dir_entry_view
from_dir_entry_index(uint32_t self_index, uint32_t parent_index,
Meta const* meta);
global_metadata const* g);
static dir_entry_view
from_dir_entry_index(uint32_t self_index, Meta const* meta);
from_dir_entry_index(uint32_t self_index, global_metadata const* g);
// TODO: this works, but it's strange; a limited version of dir_entry_view
// should work without a parent for these use cases
static std::string_view name(uint32_t index, Meta const* meta);
static inode_view inode(uint32_t index, Meta const* meta);
static std::string name(uint32_t index, global_metadata const* g);
static inode_view inode(uint32_t index, global_metadata const* g);
std::variant<DirEntryView, InodeView> v_;
uint32_t self_index_, parent_index_;
Meta const* meta_;
global_metadata const* g_;
};
using chunk_view = ::apache::thrift::frozen::View<thrift::metadata::chunk>;

View File

@ -28,7 +28,6 @@
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
@ -104,7 +103,7 @@ class metadata_v2 {
return impl_->opendir(iv);
}
std::optional<std::pair<inode_view, std::string_view>>
std::optional<std::pair<inode_view, std::string>>
readdir(directory_view dir, size_t offset) const {
return impl_->readdir(dir, offset);
}
@ -121,7 +120,7 @@ class metadata_v2 {
return impl_->readlink(iv, buf);
}
folly::Expected<std::string_view, int> readlink(inode_view iv) const {
folly::Expected<std::string, int> readlink(inode_view iv) const {
return impl_->readlink(iv);
}
@ -165,7 +164,7 @@ class metadata_v2 {
virtual std::optional<directory_view> opendir(inode_view iv) const = 0;
virtual std::optional<std::pair<inode_view, std::string_view>>
virtual std::optional<std::pair<inode_view, std::string>>
readdir(directory_view dir, size_t offset) const = 0;
virtual size_t dirsize(directory_view dir) const = 0;
@ -176,8 +175,7 @@ class metadata_v2 {
virtual int readlink(inode_view iv, std::string* buf) const = 0;
virtual folly::Expected<std::string_view, int>
readlink(inode_view iv) const = 0;
virtual folly::Expected<std::string, int> readlink(inode_view iv) const = 0;
virtual int statvfs(struct ::statvfs* stbuf) const = 0;

View File

@ -77,6 +77,13 @@ struct scanner_options {
bool pack_chunk_table{false};
bool pack_directories{false};
bool pack_shared_files_table{false};
bool plain_names_table{false};
bool pack_names{false};
bool pack_names_index{false};
bool plain_symlinks_table{false};
bool pack_symlinks{false};
bool pack_symlinks_index{false};
bool force_pack_string_tables{false};
};
struct rewrite_options {

View File

@ -0,0 +1,71 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "dwarfs/gen-cpp2/metadata_layouts.h"
namespace dwarfs {
/**
 * Read-only table of strings addressed by index.
 *
 * The table can be backed either by a legacy frozen list of strings or by
 * the packed `thrift::metadata::string_table` representation. The concrete
 * lookup strategy is hidden behind the `impl` interface.
 *
 * Lookups return the string by value (`std::string`).
 */
class string_table {
public:
// Frozen view of a plain list of strings (legacy representation).
using LegacyTableView =
::apache::thrift::frozen::View<std::vector<std::string>>;
// Frozen view of the packed string table representation.
using PackedTableView =
::apache::thrift::frozen::View<thrift::metadata::string_table>;
/**
 * Options controlling how pack() builds a packed table.
 *
 * By default both the data and the index are packed and packing is not
 * forced.
 */
struct pack_options {
pack_options(bool pack_data = true, bool pack_index = true,
bool force_pack_data = false)
: pack_data{pack_data}
, pack_index{pack_index}
, force_pack_data{force_pack_data} {}
// Request packing of the string data.
bool pack_data;
// Request packing of the string index.
bool pack_index;
// NOTE(review): presumably packs the data even when that does not save
// space -- confirm against the implementation of pack().
bool force_pack_data;
};
// Wrap a packed table view.
string_table(PackedTableView v);
// Wrap a legacy list-of-strings view.
string_table(LegacyTableView v);
// Return the string stored at `index`; forwards to the implementation's
// lookup(). No range check is performed here.
std::string operator[](size_t index) const { return impl_->lookup(index); }
/**
 * Build the packed representation of `input`.
 *
 * \param input    strings to store, in index order
 * \param options  packing options; defaults to pack_options()
 * \return the thrift structure holding the packed table
 */
static thrift::metadata::string_table
pack(std::vector<std::string> const& input,
pack_options const& options = pack_options());
// Interface implemented by the concrete table representations.
class impl {
public:
virtual ~impl() = default;
// Return the string stored at `index`.
virtual std::string lookup(size_t index) const = 0;
};
private:
// Concrete representation selected by the constructor.
std::unique_ptr<impl const> impl_;
};
} // namespace dwarfs

View File

@ -190,12 +190,11 @@ class filesystem_ final : public filesystem_v2::impl {
int getattr(inode_view entry, struct ::stat* stbuf) const override;
int access(inode_view entry, int mode, uid_t uid, gid_t gid) const override;
std::optional<directory_view> opendir(inode_view entry) const override;
std::optional<std::pair<inode_view, std::string_view>>
std::optional<std::pair<inode_view, std::string>>
readdir(directory_view dir, size_t offset) const override;
size_t dirsize(directory_view dir) const override;
int readlink(inode_view entry, std::string* buf) const override;
folly::Expected<std::string_view, int>
readlink(inode_view entry) const override;
folly::Expected<std::string, int> readlink(inode_view entry) const override;
int statvfs(struct ::statvfs* stbuf) const override;
int open(inode_view entry) const override;
ssize_t
@ -325,7 +324,7 @@ filesystem_<LoggerPolicy>::opendir(inode_view entry) const {
}
template <typename LoggerPolicy>
std::optional<std::pair<inode_view, std::string_view>>
std::optional<std::pair<inode_view, std::string>>
filesystem_<LoggerPolicy>::readdir(directory_view dir, size_t offset) const {
return meta_.readdir(dir, offset);
}
@ -342,7 +341,7 @@ int filesystem_<LoggerPolicy>::readlink(inode_view entry,
}
template <typename LoggerPolicy>
folly::Expected<std::string_view, int>
folly::Expected<std::string, int>
filesystem_<LoggerPolicy>::readlink(inode_view entry) const {
return meta_.readlink(entry);
}

View File

@ -19,14 +19,84 @@
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include "dwarfs/metadata_types.h"
#include <queue>
#include "dwarfs/error.h"
#include "dwarfs/metadata_types.h"
#include "dwarfs/overloaded.h"
#include "dwarfs/gen-cpp2/metadata_types_custom_protocol.h"
namespace dwarfs {
namespace {
/**
 * Rebuild the full directories table for images that store it packed.
 *
 * When the `packed_directories` option is set, the stored table only holds
 * delta-encoded `first_entry` values and no usable `parent_entry` values.
 * This function decodes the deltas and then walks the directory tree
 * breadth-first, starting at directory entry 0, to recover the parent entry
 * of every directory.
 *
 * \param meta  frozen metadata to read from
 * \return the unpacked table, or an empty vector if the directories are not
 *         stored packed
 *
 * Structural preconditions that were previously assumed silently (presence
 * of `dir_entries`, a non-empty directories table, a valid inode for every
 * visited parent) are now verified with DWARFS_CHECK, so a malformed image
 * is reported instead of causing out-of-bounds reads.
 */
std::vector<thrift::metadata::directory>
unpack_directories(global_metadata::Meta const* meta) {
  std::vector<thrift::metadata::directory> directories;

  if (auto opts = meta->options(); opts and opts->packed_directories()) {
    auto dir_entries = meta->dir_entries();
    DWARFS_CHECK(dir_entries, "packed directories require dir_entries");

    auto dirent = *dir_entries;
    auto metadir = meta->directories();

    // directories[0] / metadir[0] are accessed unconditionally below
    DWARFS_CHECK(metadir.size() > 0, "packed directories table is empty");

    directories.resize(metadir.size());

    // delta-decode first entries first
    directories[0].first_entry = metadir[0].first_entry();

    for (size_t i = 1; i < directories.size(); ++i) {
      directories[i].first_entry =
          directories[i - 1].first_entry + metadir[i].first_entry();
    }

    // then traverse to recover parent entries
    std::queue<uint32_t> queue;
    queue.push(0);

    while (!queue.empty()) {
      auto parent = queue.front();
      queue.pop();

      auto p_ino = dirent[parent].inode_num();

      // directories[p_ino + 1] is read below; entries pushed by the loop
      // already satisfy this bound, the root entry (0) does not by
      // construction
      DWARFS_CHECK(p_ino < (directories.size() - 1),
                   "directory inode out of range");

      auto beg = directories[p_ino].first_entry;
      auto end = directories[p_ino + 1].first_entry;

      for (auto e = beg; e < end; ++e) {
        // only entries whose inode refers to a directory are descended into;
        // the last slot of the table is never treated as a directory
        if (auto e_ino = dirent[e].inode_num();
            e_ino < (directories.size() - 1)) {
          directories[e_ino].parent_entry = parent;
          queue.push(e);
        }
      }
    }
  }

  return directories;
}
} // namespace
/**
 * Build the derived lookup tables for `meta`.
 *
 * The directories table is unpacked first; `directories_` then points at the
 * unpacked storage, or is null if nothing was unpacked. The names table uses
 * the compact representation when the metadata provides one and the plain
 * names list otherwise.
 */
global_metadata::global_metadata(Meta const* meta)
    : meta_{meta}
    , directories_storage_{unpack_directories(meta_)}
    , directories_{[this]() -> thrift::metadata::directory const* {
      if (directories_storage_.empty()) {
        return nullptr;
      }
      return directories_storage_.data();
    }()}
    , names_{[this] {
      if (auto compact = meta_->compact_names()) {
        return string_table(*compact);
      }
      return string_table(meta_->names());
    }()} {}
/**
 * Index of the first directory entry belonging to directory inode `ino`.
 *
 * Reads from the unpacked directories table when one exists and from the
 * frozen metadata otherwise. `ino` is not range-checked.
 */
uint32_t global_metadata::first_dir_entry(uint32_t ino) const {
  if (directories_ != nullptr) {
    return directories_[ino].first_entry;
  }

  return meta_->directories()[ino].first_entry();
}
/**
 * Index of the directory entry of the parent of directory inode `ino`.
 *
 * Reads from the unpacked directories table when one exists and from the
 * frozen metadata otherwise. `ino` is not range-checked.
 */
uint32_t global_metadata::parent_dir_entry(uint32_t ino) const {
  if (directories_ != nullptr) {
    return directories_[ino].parent_entry;
  }

  return meta_->directories()[ino].parent_entry();
}
uint16_t inode_view::mode() const { return meta_->modes()[mode_index()]; }
uint16_t inode_view::getuid() const { return meta_->uids()[owner_index()]; }
@ -35,13 +105,14 @@ uint16_t inode_view::getgid() const { return meta_->gids()[group_index()]; }
// TODO: pretty certain some of this stuff can be simplified
std::string_view dir_entry_view::name() const {
std::string dir_entry_view::name() const {
return std::visit(overloaded{
[this](DirEntryView const& dev) {
return meta_->names()[dev.name_index()];
return g_->names()[dev.name_index()];
},
[this](InodeView const& iv) {
return meta_->names()[iv.name_index_v2_2()];
return std::string(
g_->meta()->names()[iv.name_index_v2_2()]);
},
},
v_);
@ -50,11 +121,12 @@ std::string_view dir_entry_view::name() const {
inode_view dir_entry_view::inode() const {
return std::visit(overloaded{
[this](DirEntryView const& dev) {
return inode_view(meta_->inodes()[dev.inode_num()],
dev.inode_num(), meta_);
return inode_view(
g_->meta()->inodes()[dev.inode_num()],
dev.inode_num(), g_->meta());
},
[this](InodeView const& iv) {
return inode_view(iv, iv.inode_v2_2(), meta_);
return inode_view(iv, iv.inode_v2_2(), g_->meta());
},
},
v_);
@ -77,14 +149,16 @@ bool dir_entry_view::is_root() const {
dir_entry_view
dir_entry_view::from_dir_entry_index(uint32_t self_index, uint32_t parent_index,
Meta const* meta) {
global_metadata const* g) {
auto meta = g->meta();
if (auto de = meta->dir_entries()) {
DWARFS_CHECK(self_index < de->size(), "self_index out of range");
DWARFS_CHECK(parent_index < de->size(), "parent_index out of range");
auto dev = (*de)[self_index];
return dir_entry_view(dev, self_index, parent_index, meta);
return dir_entry_view(dev, self_index, parent_index, g);
}
DWARFS_CHECK(self_index < meta->inodes().size(), "self_index out of range");
@ -92,19 +166,20 @@ dir_entry_view::from_dir_entry_index(uint32_t self_index, uint32_t parent_index,
auto iv = meta->inodes()[self_index];
return dir_entry_view(iv, self_index, parent_index, meta);
return dir_entry_view(iv, self_index, parent_index, g);
}
dir_entry_view
dir_entry_view::from_dir_entry_index(uint32_t self_index, Meta const* meta) {
dir_entry_view dir_entry_view::from_dir_entry_index(uint32_t self_index,
global_metadata const* g) {
auto meta = g->meta();
if (auto de = meta->dir_entries()) {
DWARFS_CHECK(self_index < de->size(), "self_index out of range");
auto dev = (*de)[self_index];
DWARFS_CHECK(dev.inode_num() < meta->directories().size(),
"self_index inode out of range");
return dir_entry_view(dev, self_index,
meta->directories()[dev.inode_num()].parent_entry(),
meta);
return dir_entry_view(dev, self_index, g->parent_dir_entry(dev.inode_num()),
g);
}
DWARFS_CHECK(self_index < meta->inodes().size(), "self_index out of range");
@ -116,7 +191,7 @@ dir_entry_view::from_dir_entry_index(uint32_t self_index, Meta const* meta) {
iv, self_index,
meta->entry_table_v2_2()[meta->directories()[iv.inode_v2_2()]
.parent_entry()],
meta);
g);
}
std::optional<dir_entry_view> dir_entry_view::parent() const {
@ -124,31 +199,32 @@ std::optional<dir_entry_view> dir_entry_view::parent() const {
return std::nullopt;
}
return from_dir_entry_index(parent_index_, meta_);
return from_dir_entry_index(parent_index_, g_);
}
std::string_view dir_entry_view::name(uint32_t index, Meta const* meta) {
if (auto de = meta->dir_entries()) {
std::string dir_entry_view::name(uint32_t index, global_metadata const* g) {
if (auto de = g->meta()->dir_entries()) {
DWARFS_CHECK(index < de->size(), "index out of range");
auto dev = (*de)[index];
return meta->names()[dev.name_index()];
return g->names()[dev.name_index()];
}
DWARFS_CHECK(index < meta->inodes().size(), "index out of range");
auto iv = meta->inodes()[index];
return meta->names()[iv.name_index_v2_2()];
DWARFS_CHECK(index < g->meta()->inodes().size(), "index out of range");
auto iv = g->meta()->inodes()[index];
return std::string(g->meta()->names()[iv.name_index_v2_2()]);
}
inode_view dir_entry_view::inode(uint32_t index, Meta const* meta) {
if (auto de = meta->dir_entries()) {
inode_view dir_entry_view::inode(uint32_t index, global_metadata const* g) {
if (auto de = g->meta()->dir_entries()) {
DWARFS_CHECK(index < de->size(), "index out of range");
auto dev = (*de)[index];
return inode_view(meta->inodes()[dev.inode_num()], dev.inode_num(), meta);
return inode_view(g->meta()->inodes()[dev.inode_num()], dev.inode_num(),
g->meta());
}
DWARFS_CHECK(index < meta->inodes().size(), "index out of range");
auto iv = meta->inodes()[index];
return inode_view(iv, iv.inode_v2_2(), meta);
DWARFS_CHECK(index < g->meta()->inodes().size(), "index out of range");
auto iv = g->meta()->inodes()[index];
return inode_view(iv, iv.inode_v2_2(), g->meta());
}
std::string dir_entry_view::path() const {
@ -170,13 +246,11 @@ void dir_entry_view::append_path_to(std::string& s) const {
}
uint32_t directory_view::first_entry(uint32_t ino) const {
return directories_ ? directories_[ino].first_entry
: meta_->directories()[ino].first_entry();
return g_->first_dir_entry(ino);
}
uint32_t directory_view::parent_entry(uint32_t ino) const {
return directories_ ? directories_[ino].parent_entry
: meta_->directories()[ino].parent_entry();
return g_->parent_dir_entry(ino);
}
uint32_t directory_view::entry_count() const {
@ -194,7 +268,7 @@ uint32_t directory_view::parent_inode() const {
auto ent = parent_entry(inode_);
if (auto e = meta_->dir_entries()) {
if (auto e = g_->meta()->dir_entries()) {
ent = (*e)[ent].inode_num();
}

View File

@ -27,7 +27,6 @@
#include <ctime>
#include <numeric>
#include <ostream>
#include <queue>
#include <fcntl.h>
#include <sys/stat.h>
@ -45,10 +44,13 @@
#include <folly/container/F14Set.h>
#include <fsst.h>
#include "dwarfs/error.h"
#include "dwarfs/logger.h"
#include "dwarfs/metadata_v2.h"
#include "dwarfs/options.h"
#include "dwarfs/string_table.h"
#include "dwarfs/util.h"
#include "dwarfs/gen-cpp2/metadata_layouts.h"
@ -104,8 +106,6 @@ void analyze_frozen(std::ostream& os,
auto layout = meta.findFirstOfType<
std::unique_ptr<Layout<thrift::metadata::metadata>>>();
os << "metadata memory usage:\n";
auto& l = *layout;
std::vector<std::pair<size_t, std::string>> usage;
@ -139,13 +139,33 @@ void analyze_frozen(std::ostream& os,
auto add_string_list_size = [&](auto const& name, auto const& list,
auto const& field) {
auto count = list.size();
auto index_size = list_size(list, field);
auto data_size = list.back().end() - list.front().begin();
auto size = index_size + data_size;
auto fmt = fmt_size(name, count, size) +
fmt_detail("|- index", count, index_size) +
fmt_detail("'- data", count, data_size);
usage.emplace_back(size, fmt);
if (count > 0) {
auto index_size = list_size(list, field);
auto data_size = list.back().end() - list.front().begin();
auto size = index_size + data_size;
auto fmt = fmt_size(name, count, size) +
fmt_detail("|- data", count, data_size) +
fmt_detail("'- index", count, index_size);
usage.emplace_back(size, fmt);
}
};
// Record the memory usage of a packed string table under `name`.
// The reported size is the sum of the string data buffer, the optional
// symbol table ("dict") and the index. Tables whose data buffer is empty
// are not reported at all.
auto add_string_table_size = [&](auto const& name, auto const& table,
auto const& field) {
if (auto data_size = table.buffer().size(); data_size > 0) {
// the symbol table is optional; it contributes nothing when absent
auto dict_size =
table.symtab() ? table.symtab()->size() : static_cast<size_t>(0);
auto index_size = list_size(table.index(), field.layout.indexField);
auto size = index_size + data_size + dict_size;
// an index that is not packed holds one more element than there are
// strings, so that element is excluded from the string count
auto count = table.index().size() - (table.packed_index() ? 0 : 1);
auto fmt =
fmt_size(name, count, size) + fmt_detail("|- data", count, data_size);
// only emit the "dict" detail line when a symbol table exists
if (table.symtab()) {
fmt += fmt_detail("|- dict", count, dict_size);
}
fmt += fmt_detail("'- index", count, index_size);
usage.emplace_back(size, fmt);
}
};
#define META_LIST_SIZE(x) add_list_size(#x, meta.x(), l->x##Field)
@ -159,6 +179,13 @@ void analyze_frozen(std::ostream& os,
} \
} while (0)
// Report the size of the optional packed string table `x` of the metadata.
// Does nothing when the field is not set; otherwise forwards the table and
// the value layout of the optional field to add_string_table_size().
#define META_OPT_STRING_TABLE_SIZE(x) \
do { \
if (auto table = meta.x()) { \
add_string_table_size(#x, *table, l->x##Field.layout.valueField); \
} \
} while (0)
META_LIST_SIZE(chunks);
META_LIST_SIZE(directories);
META_LIST_SIZE(inodes);
@ -172,17 +199,27 @@ void analyze_frozen(std::ostream& os,
META_OPT_LIST_SIZE(dir_entries);
META_OPT_LIST_SIZE(shared_files_table);
META_OPT_STRING_TABLE_SIZE(compact_names);
META_OPT_STRING_TABLE_SIZE(compact_symlinks);
META_STRING_LIST_SIZE(names);
META_STRING_LIST_SIZE(symlinks);
#undef META_LIST_SIZE
#undef META_STRING_LIST_SIZE
#undef META_OPT_LIST_SIZE
#undef META_OPT_STRING_TABLE_SIZE
std::sort(usage.begin(), usage.end(), [](auto const& a, auto const& b) {
return a.first > b.first || (a.first == b.first && a.second < b.second);
});
os << "metadata memory usage:\n";
os << fmt::format(
" {0:.<20}{1:.>13L} bytes {2:6.1f} bytes/inode\n",
"total metadata", total_size,
static_cast<double>(total_size) / meta.inodes().size());
for (auto const& u : usage) {
os << u.second;
}
@ -204,7 +241,8 @@ class metadata_ final : public metadata_v2::impl {
metadata_options const& options, int inode_offset)
: data_(data)
, meta_(map_frozen<thrift::metadata::metadata>(schema, data_))
, root_(dir_entry_view::from_dir_entry_index(0, &meta_))
, global_(&meta_)
, root_(dir_entry_view::from_dir_entry_index(0, &global_))
, log_(lgr)
, inode_offset_(inode_offset)
, symlink_inode_offset_(find_inode_offset(inode_rank::INO_LNK))
@ -214,9 +252,6 @@ class metadata_ final : public metadata_v2::impl {
: meta_.entry_table_v2_2().size())
, nlinks_(build_nlinks(options))
, chunk_table_(unpack_chunk_table())
, directories_storage_(unpack_directories())
, directories_(directories_storage_.empty() ? nullptr
: directories_storage_.data())
, shared_files_(decompress_shared_files())
, unique_files_(dev_inode_offset_ - file_inode_offset_ -
(shared_files_.empty()
@ -224,7 +259,10 @@ class metadata_ final : public metadata_v2::impl {
? meta_.shared_files_table()->size()
: 0
: shared_files_.size()))
, options_(options) {
, options_(options)
, symlinks_(meta_.compact_symlinks()
? string_table(*meta_.compact_symlinks())
: string_table(meta_.symlinks())) {
if (static_cast<int>(meta_.directories().size() - 1) !=
symlink_inode_offset_) {
DWARFS_THROW(
@ -304,7 +342,7 @@ class metadata_ final : public metadata_v2::impl {
std::optional<directory_view> opendir(inode_view iv) const override;
std::optional<std::pair<inode_view, std::string_view>>
std::optional<std::pair<inode_view, std::string>>
readdir(directory_view dir, size_t offset) const override;
size_t dirsize(directory_view dir) const override {
@ -317,7 +355,7 @@ class metadata_ final : public metadata_v2::impl {
int readlink(inode_view iv, std::string* buf) const override;
folly::Expected<std::string_view, int> readlink(inode_view iv) const override;
folly::Expected<std::string, int> readlink(inode_view iv) const override;
int statvfs(struct ::statvfs* stbuf) const override;
@ -339,7 +377,7 @@ class metadata_ final : public metadata_v2::impl {
dir_entry_view
make_dir_entry_view(uint32_t self_index, uint32_t parent_index) const {
return dir_entry_view::from_dir_entry_index(self_index, parent_index,
&meta_);
&global_);
}
// This represents the order in which inodes are stored in inodes
@ -420,7 +458,7 @@ class metadata_ final : public metadata_v2::impl {
directory_view make_directory_view(inode_view iv) const {
// TODO: revisit: is this the way to do it?
return directory_view(iv.inode_num(), &meta_, directories_);
return directory_view(iv.inode_num(), &global_);
}
// TODO: see if we really need to pass the extra dir_entry_view in
@ -525,9 +563,9 @@ class metadata_ final : public metadata_v2::impl {
return rv;
}
std::string_view link_value(inode_view iv) const {
return meta_.symlinks()[meta_.symlink_table()[iv.inode_num() -
symlink_inode_offset_]];
std::string link_value(inode_view iv) const {
return symlinks_[meta_.symlink_table()[iv.inode_num() -
symlink_inode_offset_]];
}
uint64_t get_device_id(int inode) const {
@ -550,58 +588,6 @@ class metadata_ final : public metadata_v2::impl {
return chunk_table;
}
std::vector<thrift::metadata::directory> unpack_directories() const {
std::vector<thrift::metadata::directory> directories;
if (auto opts = meta_.options(); opts and opts->packed_directories()) {
auto dirent = *meta_.dir_entries();
auto metadir = meta_.directories();
{
auto ti = LOG_TIMED_DEBUG;
directories.resize(metadir.size());
// delta-decode first entries first
directories[0].first_entry = metadir[0].first_entry();
for (size_t i = 1; i < directories.size(); ++i) {
directories[i].first_entry =
directories[i - 1].first_entry + metadir[i].first_entry();
}
// then traverse to recover parent entries
std::queue<uint32_t> queue;
queue.push(0);
while (!queue.empty()) {
auto parent = queue.front();
queue.pop();
auto p_ino = dirent[parent].inode_num();
auto beg = directories[p_ino].first_entry;
auto end = directories[p_ino + 1].first_entry;
for (auto e = beg; e < end; ++e) {
if (auto e_ino = dirent[e].inode_num();
e_ino < (directories.size() - 1)) {
directories[e_ino].parent_entry = parent;
queue.push(e);
}
}
}
ti << "unpacked directories table ("
<< size_with_unit(sizeof(directories.front()) *
directories.capacity())
<< ")";
}
}
return directories;
}
std::vector<uint32_t> decompress_shared_files() const {
std::vector<uint32_t> decompressed;
@ -664,6 +650,7 @@ class metadata_ final : public metadata_v2::impl {
folly::ByteRange data_;
MappedFrozen<thrift::metadata::metadata> meta_;
const global_metadata global_;
dir_entry_view root_;
log_proxy<LoggerPolicy> log_;
const int inode_offset_;
@ -673,11 +660,10 @@ class metadata_ final : public metadata_v2::impl {
const int inode_count_;
const std::vector<uint32_t> nlinks_;
const std::vector<uint32_t> chunk_table_;
const std::vector<thrift::metadata::directory> directories_storage_;
thrift::metadata::directory const* const directories_;
const std::vector<uint32_t> shared_files_;
const int unique_files_;
const metadata_options options_;
const string_table symlinks_;
};
template <typename LoggerPolicy>
@ -770,6 +756,14 @@ void metadata_<LoggerPolicy>::dump(
boolopt("packed_chunk_table", opt->packed_chunk_table());
boolopt("packed_directories", opt->packed_directories());
boolopt("packed_shared_files_table", opt->packed_shared_files_table());
if (auto names = meta_.compact_names()) {
boolopt("packed_names", static_cast<bool>(names->symtab()));
boolopt("packed_names_index", names->packed_index());
}
if (auto symlinks = meta_.compact_symlinks()) {
boolopt("packed_symlinks", static_cast<bool>(symlinks->symtab()));
boolopt("packed_symlinks_index", symlinks->packed_index());
}
os << "options: " << boost::join(options, "\n ") << std::endl;
if (auto res = opt->time_resolution_sec()) {
os << "time resolution: " << *res << " seconds" << std::endl;
@ -1041,14 +1035,14 @@ metadata_<LoggerPolicy>::find(directory_view dir, std::string_view name) const {
auto it = std::lower_bound(range.begin(), range.end(), name,
[&](auto ix, std::string_view name) {
return dir_entry_view::name(ix, &meta_) < name;
return dir_entry_view::name(ix, &global_) < name;
});
std::optional<inode_view> rv;
if (it != range.end()) {
if (dir_entry_view::name(*it, &meta_) == name) {
rv = dir_entry_view::inode(*it, &meta_);
if (dir_entry_view::name(*it, &global_) == name) {
rv = dir_entry_view::inode(*it, &global_);
}
}
@ -1157,7 +1151,7 @@ metadata_<LoggerPolicy>::opendir(inode_view iv) const {
}
template <typename LoggerPolicy>
std::optional<std::pair<inode_view, std::string_view>>
std::optional<std::pair<inode_view, std::string>>
metadata_<LoggerPolicy>::readdir(directory_view dir, size_t offset) const {
switch (offset) {
case 0:
@ -1174,8 +1168,8 @@ metadata_<LoggerPolicy>::readdir(directory_view dir, size_t offset) const {
}
auto index = dir.first_entry() + offset;
auto inode = dir_entry_view::inode(index, &meta_);
return std::pair(inode, dir_entry_view::name(index, &meta_));
auto inode = dir_entry_view::inode(index, &global_);
return std::pair(inode, dir_entry_view::name(index, &global_));
}
return std::nullopt;
@ -1235,7 +1229,7 @@ int metadata_<LoggerPolicy>::readlink(inode_view iv, std::string* buf) const {
}
template <typename LoggerPolicy>
folly::Expected<std::string_view, int>
folly::Expected<std::string, int>
metadata_<LoggerPolicy>::readlink(inode_view iv) const {
if (S_ISLNK(iv.mode())) {
return link_value(iv);

View File

@ -55,6 +55,7 @@
#include "dwarfs/progress.h"
#include "dwarfs/scanner.h"
#include "dwarfs/script.h"
#include "dwarfs/string_table.h"
#include "dwarfs/util.h"
#include "dwarfs/version.h"
#include "dwarfs/worker_group.h"
@ -740,11 +741,32 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
fsopts.packed_directories = options_.pack_directories;
fsopts.packed_shared_files_table = options_.pack_shared_files_table;
if (options_.plain_names_table) {
mv2.names = ge_data.get_names();
} else {
auto ti = LOG_TIMED_INFO;
mv2.set_compact_names(string_table::pack(
ge_data.get_names(), string_table::pack_options(
options_.pack_names, options_.pack_names_index,
options_.force_pack_string_tables)));
ti << "saving names table...";
}
if (options_.plain_symlinks_table) {
mv2.symlinks = ge_data.get_symlinks();
} else {
auto ti = LOG_TIMED_INFO;
mv2.set_compact_symlinks(string_table::pack(
ge_data.get_symlinks(),
string_table::pack_options(options_.pack_symlinks,
options_.pack_symlinks_index,
options_.force_pack_string_tables)));
ti << "saving symlinks table...";
}
mv2.uids = ge_data.get_uids();
mv2.gids = ge_data.get_gids();
mv2.modes = ge_data.get_modes();
mv2.names = ge_data.get_names();
mv2.symlinks = ge_data.get_symlinks();
mv2.timestamp_base = ge_data.get_timestamp_base();
mv2.block_size = UINT32_C(1) << cfg_.block_size_bits;
mv2.total_fs_size = prog.original_size;

248
src/dwarfs/string_table.cpp Normal file
View File

@ -0,0 +1,248 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <numeric>
#include <fmt/format.h>
#include <fsst.h>
#include "dwarfs/error.h"
#include "dwarfs/string_table.h"
namespace dwarfs {
class legacy_string_table : public string_table::impl {
public:
legacy_string_table(string_table::LegacyTableView v)
: v_{v} {}
std::string lookup(size_t index) const override {
return std::string(v_[index]);
}
private:
string_table::LegacyTableView v_;
};
/**
 * String table backed by the packed representation.
 *
 * All strings live back to back in a single buffer; the index describes
 * where each string starts and ends.
 *
 * \tparam PackedData   the buffer holds FSST-compressed strings that are
 *                      decompressed on lookup using the table's symbol table
 * \tparam PackedIndex  the stored index holds per-string values that are
 *                      summed up into absolute offsets at construction time;
 *                      otherwise the stored index is used as offsets directly
 */
template <bool PackedData, bool PackedIndex>
class packed_string_table : public string_table::impl {
 public:
  packed_string_table(string_table::PackedTableView v)
      : v_{v}
      , buffer_{v_.buffer().data()} {
    if constexpr (PackedData) {
      auto st = v_.symtab();
      DWARFS_CHECK(st, "symtab unexpectedly unset");

      dec_ = std::make_unique<fsst_decoder_t>();

      // fsst_import() takes a non-const pointer, hence the cast
      auto* const raw_symtab =
          reinterpret_cast<unsigned char*>(const_cast<char*>(st->data()));
      auto const imported = fsst_import(dec_.get(), raw_symtab);

      // the import must consume exactly the stored symbol table
      if (imported != st->size()) {
        DWARFS_THROW(runtime_error,
                     fmt::format("read {0} symtab bytes, expected {1}",
                                 imported, st->size()));
      }
    }

    if constexpr (PackedIndex) {
      DWARFS_CHECK(v_.packed_index(), "index unexpectedly not packed");

      // index_[0] stays 0; index_[i + 1] becomes the running sum of the
      // first i + 1 stored values
      auto stored = v_.index();
      index_.resize(stored.size() + 1);
      std::partial_sum(stored.begin(), stored.end(), index_.begin() + 1);
    }
  }

  // Return the string at `index`, decompressing it if the data is packed.
  // The index is not range-checked here.
  std::string lookup(size_t index) const override {
    char const* const first = buffer_ + offset_of(index);
    char const* const last = buffer_ + offset_of(index + 1);

    if constexpr (PackedData) {
      // per-thread scratch buffer that is reused across lookups; the result
      // is copied out on return
      thread_local std::string out;

      size_t const packed_len = last - first;

      // reserve eight output bytes per input byte
      // NOTE(review): presumably FSST's worst-case expansion -- confirm
      out.resize(8 * packed_len);

      auto const unpacked_len = fsst_decompress(
          dec_.get(), packed_len,
          reinterpret_cast<unsigned char*>(const_cast<char*>(first)),
          out.size(), reinterpret_cast<unsigned char*>(out.data()));

      out.resize(unpacked_len);

      return out;
    } else {
      return std::string(first, last);
    }
  }

 private:
  // Offset into buffer_ for slot `slot` of the index, taken from the
  // unpacked copy or from the stored index depending on PackedIndex.
  size_t offset_of(size_t slot) const {
    if constexpr (PackedIndex) {
      return index_[slot];
    } else {
      return v_.index()[slot];
    }
  }

  string_table::PackedTableView v_;
  char const* const buffer_;
  std::vector<uint32_t> index_;         // only filled when PackedIndex
  std::unique_ptr<fsst_decoder_t> dec_; // only set when PackedData
};
/** Creates a string table that serves lookups from a legacy table view. */
string_table::string_table(LegacyTableView v)
    : impl_(std::make_unique<legacy_string_table>(v)) {}
namespace {

/**
 * Picks the `packed_string_table` instantiation matching the table view.
 *
 * The data is treated as packed when the view carries a symbol table, and
 * the index is treated as packed when the view's `packed_index` flag is set.
 */
std::unique_ptr<string_table::impl>
build_string_table(string_table::PackedTableView v) {
  if (v.symtab()) {
    if (v.packed_index()) {
      return std::make_unique<packed_string_table<true, true>>(v);
    }
    return std::make_unique<packed_string_table<true, false>>(v);
  }

  if (v.packed_index()) {
    return std::make_unique<packed_string_table<false, true>>(v);
  }

  return std::make_unique<packed_string_table<false, false>>(v);
}

} // namespace
/**
 * Creates a string table that serves lookups from a compact table view,
 * using the implementation selected by `build_string_table`.
 */
string_table::string_table(PackedTableView v)
    : impl_(build_string_table(v)) {}
/**
 * Builds the serializable (thrift) representation of a string table.
 *
 * The strings in `input` are stored back to back in a single buffer,
 * either verbatim or FSST compressed, together with an index describing
 * where each string lives in that buffer.
 *
 * \param input    the strings to store, in lookup order
 * \param options  `pack_data` requests FSST compression of the string
 *                 data. Unless `force_pack_data` is set, the compressed
 *                 form is only used if the exported symbol table is smaller
 *                 than the total input size and the compressed data fits in
 *                 the remaining bytes; otherwise the strings are stored
 *                 uncompressed. `pack_index` stores per-string lengths in
 *                 the index instead of cumulative offsets.
 *
 * \return the populated `thrift::metadata::string_table`
 */
thrift::metadata::string_table
string_table::pack(std::vector<std::string> const& input,
                   pack_options const& options) {
  // Lower bound for the output buffer size when a compression attempt
  // has to be retried.
  constexpr size_t min_retry_buffer_size = 64;

  auto size = input.size();
  bool pack_data = options.pack_data;
  size_t total_input_size = 0;
  std::string buffer;
  std::string symtab;
  std::vector<size_t> out_len_vec;
  std::vector<unsigned char*> out_ptr_vec;

  if (input.empty()) {
    // There is nothing to compress, and the compressed path below relies
    // on non-empty result vectors (it uses front() and back()).
    pack_data = false;
  }

  if (pack_data) {
    std::vector<size_t> len_vec;
    std::vector<unsigned char*> ptr_vec;

    len_vec.reserve(size);
    ptr_vec.reserve(size);

    for (auto const& s : input) {
      // The FSST C API takes mutable pointers, hence the casts.
      ptr_vec.emplace_back(
          reinterpret_cast<unsigned char*>(const_cast<char*>(s.data())));
      len_vec.emplace_back(s.size());
      total_input_size += s.size();
    }

    std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{
        ::fsst_create(size, len_vec.data(), ptr_vec.data(), 0),
        &::fsst_destroy};

    // Export the symbol table, then shrink to the number of bytes written.
    symtab.resize(sizeof(::fsst_decoder_t));

    auto symtab_size = ::fsst_export(
        enc.get(), reinterpret_cast<unsigned char*>(symtab.data()));

    symtab.resize(symtab_size);

    if (symtab.size() < total_input_size or options.force_pack_data) {
      out_len_vec.resize(size);
      out_ptr_vec.resize(size);

      // Unless forced, compression only pays off if symbol table plus
      // compressed data end up smaller than the input, so that is all the
      // room the compressor gets.
      buffer.resize(options.force_pack_data ? total_input_size
                                            : total_input_size - symtab.size());

      size_t num_compressed = 0;

      do {
        num_compressed = ::fsst_compress(
            enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(),
            reinterpret_cast<unsigned char*>(buffer.data()), out_len_vec.data(),
            out_ptr_vec.data());

        if (num_compressed == size) {
          break;
        }

        // Not all strings fit: grow the buffer and, if forced, try again.
        // The lower bound matters when all input strings are empty and
        // force_pack_data is set: the buffer then starts out with size 0,
        // and doubling 0 would never grow it, so the loop could not
        // terminate.
        buffer.resize(
            std::max<size_t>(2 * buffer.size(), min_retry_buffer_size));
      } while (options.force_pack_data);

      pack_data = num_compressed == size;
    } else {
      pack_data = false;
    }
  } else {
    for (auto const& s : input) {
      total_input_size += s.size();
    }
  }

  thrift::metadata::string_table output;

  if (pack_data) {
    // store compressed
    size_t compressed_size =
        (out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back();

    DWARFS_CHECK(reinterpret_cast<char*>(out_ptr_vec.front()) == buffer.data(),
                 "string table compression pointer mismatch");
    // TODO: only enable this in debug mode
    DWARFS_CHECK(compressed_size == std::accumulate(out_len_vec.begin(),
                                                    out_len_vec.end(),
                                                    static_cast<size_t>(0)),
                 "string table compression size mismatch");

    buffer.resize(compressed_size);
    output.buffer.swap(buffer);
    output.set_symtab(std::move(symtab));
    output.index.resize(size);
    std::copy(out_len_vec.begin(), out_len_vec.end(), output.index.begin());
  } else {
    // store uncompressed
    output.buffer.reserve(total_input_size);
    output.index.reserve(size);
    for (auto const& s : input) {
      output.buffer += s;
      output.index.emplace_back(s.size());
    }
  }

  output.packed_index = options.pack_index;

  if (!options.pack_index) {
    // Convert the per-string lengths into offsets: prepend 0 and replace
    // every entry by the running sum, giving size + 1 entries in total.
    output.index.insert(output.index.begin(), 0);
    std::partial_sum(output.index.begin(), output.index.end(),
                     output.index.begin());
  }

  return output;
}
} // namespace dwarfs

View File

@ -397,7 +397,9 @@ int mkdwarfs(int argc, char** argv) {
"metadata compression algorithm")
("pack-metadata",
po::value<std::string>(&pack_metadata)->default_value("all"),
"pack certain metadata elements (none, chunk_table, directories, shared_files, all)")
"pack certain metadata elements (none, all, chunk_table, "
"directories, shared_files, names, names_index, symlinks, "
"symlinks_index)")
("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)")
@ -733,6 +735,10 @@ int mkdwarfs(int argc, char** argv) {
options.pack_chunk_table = true;
options.pack_directories = true;
options.pack_shared_files_table = true;
options.pack_names = true;
options.pack_names_index = true;
options.pack_symlinks = true;
options.pack_symlinks_index = true;
} else {
std::vector<std::string> pack_opts;
boost::split(pack_opts, pack_metadata, boost::is_any_of(","));
@ -743,6 +749,14 @@ int mkdwarfs(int argc, char** argv) {
options.pack_directories = true;
} else if (opt == "shared_files") {
options.pack_shared_files_table = true;
} else if (opt == "names") {
options.pack_names = true;
} else if (opt == "names_index") {
options.pack_names_index = true;
} else if (opt == "symlinks") {
options.pack_symlinks = true;
} else if (opt == "symlinks_index") {
options.pack_symlinks_index = true;
} else {
std::cerr << "error: the argument ('" << opt
<< "') to '--pack-metadata' is invalid" << std::endl;

View File

@ -195,8 +195,10 @@ void basic_end_to_end_test(std::string const& compressor,
bool with_devices, bool with_specials, bool set_uid,
bool set_gid, bool set_time, bool keep_all_times,
bool enable_nlink, bool pack_chunk_table,
bool pack_directories,
bool pack_shared_files_table) {
bool pack_directories, bool pack_shared_files_table,
bool pack_names, bool pack_names_index,
bool pack_symlinks, bool pack_symlinks_index,
bool plain_names_table, bool plain_symlinks_table) {
block_manager::config cfg;
scanner_options options;
@ -212,6 +214,13 @@ void basic_end_to_end_test(std::string const& compressor,
options.pack_chunk_table = pack_chunk_table;
options.pack_directories = pack_directories;
options.pack_shared_files_table = pack_shared_files_table;
options.pack_names = pack_names;
options.pack_names_index = pack_names_index;
options.pack_symlinks = pack_symlinks;
options.pack_symlinks_index = pack_symlinks_index;
options.force_pack_string_tables = true;
options.plain_names_table = plain_names_table;
options.plain_symlinks_table = plain_symlinks_table;
if (set_uid) {
options.uid = 0;
@ -518,11 +527,17 @@ class compression_test
: public testing::TestWithParam<
std::tuple<std::string, unsigned, file_order_mode>> {};
class scanner_test
: public testing::TestWithParam<std::tuple<bool, bool, bool, bool, bool,
bool, bool, bool, bool, bool>> {
class scanner_test : public testing::TestWithParam<
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
};
class packing_test : public testing::TestWithParam<
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
};
class plain_tables_test
: public testing::TestWithParam<std::tuple<bool, bool>> {};
TEST_P(compression_test, end_to_end) {
auto [compressor, block_size_bits, file_order] = GetParam();
@ -532,18 +547,38 @@ TEST_P(compression_test, end_to_end) {
}
basic_end_to_end_test(compressor, block_size_bits, file_order, true, true,
false, false, false, false, false, true, true, true);
false, false, false, false, false, true, true, true,
true, true, true, true, false, false);
}
TEST_P(scanner_test, end_to_end) {
auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times,
enable_nlink, pack_chunk_table, pack_directories,
pack_shared_files_table] = GetParam();
enable_nlink] = GetParam();
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE,
with_devices, with_specials, set_uid, set_gid, set_time,
keep_all_times, enable_nlink, pack_chunk_table,
pack_directories, pack_shared_files_table);
keep_all_times, enable_nlink, true, true, true, true,
true, true, true, false, false);
}
TEST_P(packing_test, end_to_end) {
auto [pack_chunk_table, pack_directories, pack_shared_files_table, pack_names,
pack_names_index, pack_symlinks, pack_symlinks_index] = GetParam();
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
false, false, false, false, false, pack_chunk_table,
pack_directories, pack_shared_files_table, pack_names,
pack_names_index, pack_symlinks, pack_symlinks_index,
false, false);
}
TEST_P(plain_tables_test, end_to_end) {
auto [plain_names_table, plain_symlinks_table] = GetParam();
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
false, false, false, false, false, false, false, false,
false, false, false, false, plain_names_table,
plain_symlinks_table);
}
INSTANTIATE_TEST_SUITE_P(
@ -558,5 +593,14 @@ INSTANTIATE_TEST_SUITE_P(
dwarfs, scanner_test,
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool()));
INSTANTIATE_TEST_SUITE_P(
dwarfs, packing_test,
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool()));
INSTANTIATE_TEST_SUITE_P(dwarfs, plain_tables_test,
::testing::Combine(::testing::Bool(),
::testing::Bool()));

View File

@ -142,6 +142,13 @@ struct fs_options {
5: required bool packed_shared_files_table,
}
// Compact representation of a list of strings.
struct string_table {
// All strings stored back to back; holds FSST-compressed data when
// `symtab` is set and the plain string data otherwise.
1: required string buffer,
// Exported FSST symbol table used to decompress `buffer`; only set when
// the string data is stored compressed.
2: optional string symtab,
// Locates the individual strings inside `buffer`: one length per string
// when `packed_index` is true, otherwise cumulative offsets starting at 0
// (one more entry than there are strings).
3: required list<UInt32> index,
// Selects how `index` is interpreted (see above).
4: required bool packed_index,
}
/**
* File System Metadata
*
@ -320,4 +327,8 @@ struct metadata {
// unix timestamp of metadata creation time
23: optional UInt64 create_timestamp,
24: optional string_table compact_names,
25: optional string_table compact_symlinks,
}