Flexible metadata packing

This commit is contained in:
Marcus Holland-Moritz 2021-03-19 12:04:11 +01:00
parent fdf29cd1d9
commit 7b38c0744f
6 changed files with 148 additions and 58 deletions

View File

@ -74,6 +74,9 @@ struct scanner_options {
bool with_specials{false};
uint32_t time_resolution_sec{1};
inode_options inode;
bool pack_chunk_table{false};
bool pack_directories{false};
bool pack_shared_files_table{false};
};
struct rewrite_options {

View File

@ -35,6 +35,8 @@
#include <time.h>
#include <unistd.h>
#include <boost/algorithm/string.hpp>
#include <thrift/lib/cpp2/frozen/FrozenUtil.h>
#include <thrift/lib/cpp2/protocol/DebugProtocol.h>
#include <thrift/lib/cpp2/protocol/Serializer.h>
@ -125,11 +127,15 @@ class metadata_ final : public metadata_v2::impl {
, nlinks_(build_nlinks(options))
, chunk_table_(unpack_chunk_table())
, directories_storage_(unpack_directories())
, directories_(meta_.dir_entries() ? directories_storage_.data()
: nullptr)
, directories_(directories_storage_.empty() ? nullptr
: directories_storage_.data())
, shared_files_(decompress_shared_files())
, unique_files_(dev_inode_offset_ - file_inode_offset_ -
shared_files_.size())
(shared_files_.empty()
? meta_.shared_files_table()
? meta_.shared_files_table()->size()
: 0
: shared_files_.size()))
, options_(options) {
if (static_cast<int>(meta_.directories().size() - 1) !=
symlink_inode_offset_) {
@ -350,22 +356,30 @@ class metadata_ final : public metadata_v2::impl {
return chunk_table_.empty() ? meta_.chunk_table()[ino] : chunk_table_[ino];
}
// Translates a file inode number into an index usable with the chunk table.
//
// The inode is first rebased by `file_inode_offset_`. Rebased values below
// `unique_files_` are returned unchanged. Values at or above `unique_files_`
// are resolved through the shared files table: the unpacked `shared_files_`
// vector if it is non-empty, otherwise the table stored in the metadata (if
// present). The table entry plus `unique_files_` is the resulting index.
//
// Returns a negative value when the inode cannot be mapped: either it lies
// below `file_inode_offset_`, or it is beyond the unique files and has no
// entry in the shared files table. Returning -1 in the latter case (instead
// of the partially rebased value) keeps such an inode from aliasing the chunk
// index of an unrelated file; callers such as get_chunk_range() reject
// negative indices.
int file_inode_to_chunk_index(int inode) const {
  inode -= file_inode_offset_;

  if (inode < unique_files_) {
    // Unique file (or a negative value for inodes below the file range).
    return inode;
  }

  // Position within the shared files table.
  inode -= unique_files_;

  if (!shared_files_.empty()) {
    if (inode < static_cast<int>(shared_files_.size())) {
      return shared_files_[inode] + unique_files_;
    }
  } else if (auto sfp = meta_.shared_files_table()) {
    if (inode < static_cast<int>(sfp->size())) {
      return (*sfp)[inode] + unique_files_;
    }
  }

  // No shared files entry for this inode.
  return -1;
}
std::optional<chunk_range> get_chunk_range(int inode) const {
std::optional<chunk_range> rv;
inode -= file_inode_offset_;
if (!shared_files_.empty()) {
if (inode >= unique_files_) {
inode -= unique_files_;
if (inode >= static_cast<int>(shared_files_.size())) {
return rv;
}
inode = shared_files_[inode] + unique_files_;
}
}
inode = file_inode_to_chunk_index(inode);
if (inode >= 0 &&
inode < (static_cast<int>(meta_.chunk_table().size()) - 1)) {
@ -439,7 +453,7 @@ class metadata_ final : public metadata_v2::impl {
std::vector<uint32_t> unpack_chunk_table() const {
std::vector<uint32_t> chunk_table;
if (meta_.dir_entries()) {
if (auto opts = meta_.options(); opts and opts->packed_chunk_table()) {
chunk_table.resize(meta_.chunk_table().size());
std::partial_sum(meta_.chunk_table().begin(), meta_.chunk_table().end(),
chunk_table.begin());
@ -451,8 +465,8 @@ class metadata_ final : public metadata_v2::impl {
std::vector<thrift::metadata::directory> unpack_directories() const {
std::vector<thrift::metadata::directory> directories;
if (auto dep = meta_.dir_entries()) {
auto dirent = *dep;
if (auto opts = meta_.options(); opts and opts->packed_directories()) {
auto dirent = *meta_.dir_entries();
auto metadir = meta_.directories();
{
@ -503,8 +517,9 @@ class metadata_ final : public metadata_v2::impl {
std::vector<uint32_t> decompress_shared_files() const {
std::vector<uint32_t> decompressed;
if (auto sfp = meta_.shared_files_table()) {
if (!sfp->empty()) {
if (auto opts = meta_.options();
opts and opts->packed_shared_files_table()) {
if (auto sfp = meta_.shared_files_table(); sfp and !sfp->empty()) {
auto ti = LOG_TIMED_DEBUG;
auto size = std::accumulate(sfp->begin(), sfp->end(), 2 * sfp->size());
@ -656,6 +671,22 @@ void metadata_<LoggerPolicy>::dump(
os << "block size: " << stbuf.f_bsize << std::endl;
os << "inode count: " << stbuf.f_files << std::endl;
os << "original filesystem size: " << stbuf.f_blocks << std::endl;
if (auto opt = meta_.options()) {
std::vector<std::string> options;
auto boolopt = [&](auto const& name, bool value) {
if (value) {
options.push_back(name);
}
};
boolopt("mtime_only", opt->mtime_only());
boolopt("packed_chunk_table", opt->packed_chunk_table());
boolopt("packed_directories", opt->packed_directories());
boolopt("packed_shared_files_table", opt->packed_shared_files_table());
os << "options: " << boost::join(options, "\n ") << std::endl;
if (auto res = opt->time_resolution_sec()) {
os << "time resolution: " << *res << " seconds" << std::endl;
}
}
}
if (detail_level > 1) {
@ -681,9 +712,13 @@ void metadata_<LoggerPolicy>::dump(
os << "dir_entries: " << de->size() << std::endl;
}
if (auto sfp = meta_.shared_files_table()) {
os << "compressed shared_files_table: " << sfp->size() << std::endl;
os << "decompressed shared_files_table: " << shared_files_.size()
<< std::endl;
if (meta_.options()->packed_shared_files_table()) {
os << "compressed shared_files_table: " << sfp->size() << std::endl;
os << "decompressed shared_files_table: " << shared_files_.size()
<< std::endl;
} else {
os << "shared_files_table: " << sfp->size() << std::endl;
}
os << "unique files: " << unique_files_ << std::endl;
}
@ -870,10 +905,7 @@ void metadata_<LoggerPolicy>::walk_data_order_impl(
for (size_t ix = 0; ix < first_chunk_block.size(); ++ix) {
int ino = (*dep)[ix].inode_num();
if (ino >= file_inode_offset_ and ino < dev_inode_offset_) {
ino -= file_inode_offset_;
if (ino >= unique_files_) {
ino = shared_files_[ino - unique_files_] + unique_files_;
}
ino = file_inode_to_chunk_index(ino);
if (auto beg = chunk_table_lookup(ino);
beg != chunk_table_lookup(ino + 1)) {
first_chunk_block[ix] = meta_.chunks()[beg].block();

View File

@ -322,8 +322,8 @@ class save_shared_files_visitor : public visitor_base {
}
}
std::vector<uint32_t>& get_compressed_shared_files() {
if (!shared_files_.empty() && !compressed_) {
void pack_shared_files() {
if (!shared_files_.empty()) {
DWARFS_CHECK(std::is_sorted(shared_files_.begin(), shared_files_.end()),
"shared files vector not sorted");
std::vector<uint32_t> compressed;
@ -350,14 +350,13 @@ class save_shared_files_visitor : public visitor_base {
shared_files_.swap(compressed);
}
return shared_files_;
}
// Gives the caller mutable access to the collected shared files table so it
// can be moved out of the visitor. The contents are whatever `shared_files_`
// currently holds, i.e. the packed form if pack_shared_files() replaced it.
std::vector<uint32_t>& get_shared_files() {
  return shared_files_;
}
private:
uint32_t const begin_shared_;
uint32_t const num_unique_;
bool compressed_{false};
std::vector<uint32_t> shared_files_;
};
@ -705,30 +704,41 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
root->accept(sdv);
sdv.pack(mv2, ge_data);
// pack directories
uint32_t last_first_entry = 0;
for (auto& d : mv2.directories) {
d.parent_entry = 0; // this will be recovered
auto delta = d.first_entry - last_first_entry;
last_first_entry = d.first_entry;
d.first_entry = delta;
if (options_.pack_directories) {
// pack directories
uint32_t last_first_entry = 0;
for (auto& d : mv2.directories) {
d.parent_entry = 0; // this will be recovered
auto delta = d.first_entry - last_first_entry;
last_first_entry = d.first_entry;
d.first_entry = delta;
}
}
// delta-compress chunk table
std::adjacent_difference(mv2.chunk_table.begin(), mv2.chunk_table.end(),
mv2.chunk_table.begin());
if (options_.pack_chunk_table) {
// delta-compress chunk table
std::adjacent_difference(mv2.chunk_table.begin(), mv2.chunk_table.end(),
mv2.chunk_table.begin());
}
LOG_INFO << "saving shared files table...";
save_shared_files_visitor ssfv(first_file_inode, first_device_inode,
fdv.num_unique());
root->accept(ssfv);
mv2.shared_files_table_ref() = std::move(ssfv.get_compressed_shared_files());
if (options_.pack_shared_files_table) {
ssfv.pack_shared_files();
}
mv2.shared_files_table_ref() = std::move(ssfv.get_shared_files());
thrift::metadata::fs_options fsopts;
fsopts.mtime_only = !options_.keep_all_times;
if (options_.time_resolution_sec > 1) {
fsopts.time_resolution_sec_ref() = options_.time_resolution_sec;
}
fsopts.packed_chunk_table = options_.pack_chunk_table;
fsopts.packed_directories = options_.pack_directories;
fsopts.packed_shared_files_table = options_.pack_shared_files_table;
mv2.uids = ge_data.get_uids();
mv2.gids = ge_data.get_gids();

View File

@ -331,7 +331,7 @@ int mkdwarfs(int argc, char** argv) {
block_manager::config cfg;
std::string path, output, memory_limit, script_arg, compression,
schema_compression, metadata_compression, log_level_str, timestamp,
time_resolution, order, progress_mode, recompress_opts;
time_resolution, order, progress_mode, recompress_opts, pack_metadata;
size_t num_workers, max_scanner_workers;
bool no_progress = false;
unsigned level;
@ -395,6 +395,9 @@ int mkdwarfs(int argc, char** argv) {
("metadata-compression",
po::value<std::string>(&metadata_compression),
"metadata compression algorithm")
("pack-metadata",
po::value<std::string>(&pack_metadata)->default_value("all"),
"pack certain metadata elements (none, chunk_table, directories, shared_files, all)")
("recompress",
po::value<std::string>(&recompress_opts)->implicit_value("all"),
"recompress an existing filesystem (none, block, metadata, all)")
@ -725,6 +728,30 @@ int mkdwarfs(int argc, char** argv) {
return 1;
}
if (!pack_metadata.empty() and pack_metadata != "none") {
if (pack_metadata == "all") {
options.pack_chunk_table = true;
options.pack_directories = true;
options.pack_shared_files_table = true;
} else {
std::vector<std::string> pack_opts;
boost::split(pack_opts, pack_metadata, boost::is_any_of(","));
for (auto const& opt : pack_opts) {
if (opt == "chunk_table") {
options.pack_chunk_table = true;
} else if (opt == "directories") {
options.pack_directories = true;
} else if (opt == "shared_files") {
options.pack_shared_files_table = true;
} else {
std::cerr << "error: the argument ('" << opt
<< "') to '--pack-metadata' is invalid" << std::endl;
return 1;
}
}
}
}
unsigned interval_ms =
pg_mode == console_writer::NONE || pg_mode == console_writer::SIMPLE
? 2000

View File

@ -194,7 +194,9 @@ void basic_end_to_end_test(std::string const& compressor,
unsigned block_size_bits, file_order_mode file_order,
bool with_devices, bool with_specials, bool set_uid,
bool set_gid, bool set_time, bool keep_all_times,
bool enable_nlink) {
bool enable_nlink, bool pack_chunk_table,
bool pack_directories,
bool pack_shared_files_table) {
block_manager::config cfg;
scanner_options options;
@ -207,6 +209,9 @@ void basic_end_to_end_test(std::string const& compressor,
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
options.inode.with_nilsimsa = file_order == file_order_mode::NILSIMSA;
options.keep_all_times = keep_all_times;
options.pack_chunk_table = pack_chunk_table;
options.pack_directories = pack_directories;
options.pack_shared_files_table = pack_shared_files_table;
if (set_uid) {
options.uid = 0;
@ -513,8 +518,9 @@ class compression_test
: public testing::TestWithParam<
std::tuple<std::string, unsigned, file_order_mode>> {};
class scanner_test : public testing::TestWithParam<
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
class scanner_test
: public testing::TestWithParam<std::tuple<bool, bool, bool, bool, bool,
bool, bool, bool, bool, bool>> {
};
TEST_P(compression_test, end_to_end) {
@ -526,16 +532,18 @@ TEST_P(compression_test, end_to_end) {
}
basic_end_to_end_test(compressor, block_size_bits, file_order, true, true,
false, false, false, false, false);
false, false, false, false, false, true, true, true);
}
TEST_P(scanner_test, end_to_end) {
auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times,
enable_nlink] = GetParam();
enable_nlink, pack_chunk_table, pack_directories,
pack_shared_files_table] = GetParam();
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE,
with_devices, with_specials, set_uid, set_gid, set_time,
keep_all_times, enable_nlink);
keep_all_times, enable_nlink, pack_chunk_table,
pack_directories, pack_shared_files_table);
}
INSTANTIATE_TEST_SUITE_P(
@ -549,5 +557,6 @@ INSTANTIATE_TEST_SUITE_P(
INSTANTIATE_TEST_SUITE_P(
dwarfs, scanner_test,
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
::testing::Bool()));

View File

@ -136,6 +136,10 @@ struct fs_options {
// time base and offsets are stored with this resolution
// 1 = seconds, 60 = minutes, 3600 = hours, ...
2: optional UInt32 time_resolution_sec,
3: required bool packed_chunk_table,
4: required bool packed_directories,
5: required bool packed_shared_files_table,
}
/**
@ -168,8 +172,8 @@ struct metadata {
* same for all directories.
*
* Note that this list is stored in a packed format as of v2.3
* and needs to be unpacked before use. See the documentation
* for the `directory` struct.
* if `options.packed_directories` is `true` and must be unpacked
* before use. See the documentation for the `directory` struct.
*/
2: required list<directory> directories,
@ -201,8 +205,9 @@ struct metadata {
* There's one extra sentinel item at the end that points to the
* end of `chunks`, so chunk lookups work the same for all inodes.
*
* Note that this is stored delta-compressed as of v2.3 and must
* be unpacked before using.
* Note that this list is stored delta-compressed as of v2.3
* if `options.packed_chunk_table` is `true` and must be unpacked
* before use.
*/
4: required list<UInt32> chunk_table,
@ -286,9 +291,13 @@ struct metadata {
/**
* Shared files mapping
*
* Note that this table cannot be used directly and must first
* be unpacked. It is stored as number of repetitions per index,
* offset by 2 (the minimum number of repetitions), so e.g.
* Note that this list is stored in a packed format if
* `options.packed_shared_files_table` is `true` and must be
* unpacked before use.
*
* In packed format, it is stored as number of repetitions
* per index, offset by 2 (the minimum number of repetitions),
* so e.g. a packed list
*
* [0, 3, 1, 0, 1]
*