mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-07 19:41:54 -04:00
Flexible metadata packing
This commit is contained in:
parent
fdf29cd1d9
commit
7b38c0744f
@ -74,6 +74,9 @@ struct scanner_options {
|
||||
bool with_specials{false};
|
||||
uint32_t time_resolution_sec{1};
|
||||
inode_options inode;
|
||||
bool pack_chunk_table{false};
|
||||
bool pack_directories{false};
|
||||
bool pack_shared_files_table{false};
|
||||
};
|
||||
|
||||
struct rewrite_options {
|
||||
|
@ -35,6 +35,8 @@
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
#include <thrift/lib/cpp2/frozen/FrozenUtil.h>
|
||||
#include <thrift/lib/cpp2/protocol/DebugProtocol.h>
|
||||
#include <thrift/lib/cpp2/protocol/Serializer.h>
|
||||
@ -125,11 +127,15 @@ class metadata_ final : public metadata_v2::impl {
|
||||
, nlinks_(build_nlinks(options))
|
||||
, chunk_table_(unpack_chunk_table())
|
||||
, directories_storage_(unpack_directories())
|
||||
, directories_(meta_.dir_entries() ? directories_storage_.data()
|
||||
: nullptr)
|
||||
, directories_(directories_storage_.empty() ? nullptr
|
||||
: directories_storage_.data())
|
||||
, shared_files_(decompress_shared_files())
|
||||
, unique_files_(dev_inode_offset_ - file_inode_offset_ -
|
||||
shared_files_.size())
|
||||
(shared_files_.empty()
|
||||
? meta_.shared_files_table()
|
||||
? meta_.shared_files_table()->size()
|
||||
: 0
|
||||
: shared_files_.size()))
|
||||
, options_(options) {
|
||||
if (static_cast<int>(meta_.directories().size() - 1) !=
|
||||
symlink_inode_offset_) {
|
||||
@ -350,22 +356,30 @@ class metadata_ final : public metadata_v2::impl {
|
||||
return chunk_table_.empty() ? meta_.chunk_table()[ino] : chunk_table_[ino];
|
||||
}
|
||||
|
||||
int file_inode_to_chunk_index(int inode) const {
|
||||
inode -= file_inode_offset_;
|
||||
|
||||
if (inode >= unique_files_) {
|
||||
inode -= unique_files_;
|
||||
|
||||
if (!shared_files_.empty()) {
|
||||
if (inode < static_cast<int>(shared_files_.size())) {
|
||||
inode = shared_files_[inode] + unique_files_;
|
||||
}
|
||||
} else if (auto sfp = meta_.shared_files_table()) {
|
||||
if (inode < static_cast<int>(sfp->size())) {
|
||||
inode = (*sfp)[inode] + unique_files_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
std::optional<chunk_range> get_chunk_range(int inode) const {
|
||||
std::optional<chunk_range> rv;
|
||||
|
||||
inode -= file_inode_offset_;
|
||||
|
||||
if (!shared_files_.empty()) {
|
||||
if (inode >= unique_files_) {
|
||||
inode -= unique_files_;
|
||||
|
||||
if (inode >= static_cast<int>(shared_files_.size())) {
|
||||
return rv;
|
||||
}
|
||||
|
||||
inode = shared_files_[inode] + unique_files_;
|
||||
}
|
||||
}
|
||||
inode = file_inode_to_chunk_index(inode);
|
||||
|
||||
if (inode >= 0 &&
|
||||
inode < (static_cast<int>(meta_.chunk_table().size()) - 1)) {
|
||||
@ -439,7 +453,7 @@ class metadata_ final : public metadata_v2::impl {
|
||||
std::vector<uint32_t> unpack_chunk_table() const {
|
||||
std::vector<uint32_t> chunk_table;
|
||||
|
||||
if (meta_.dir_entries()) {
|
||||
if (auto opts = meta_.options(); opts and opts->packed_chunk_table()) {
|
||||
chunk_table.resize(meta_.chunk_table().size());
|
||||
std::partial_sum(meta_.chunk_table().begin(), meta_.chunk_table().end(),
|
||||
chunk_table.begin());
|
||||
@ -451,8 +465,8 @@ class metadata_ final : public metadata_v2::impl {
|
||||
std::vector<thrift::metadata::directory> unpack_directories() const {
|
||||
std::vector<thrift::metadata::directory> directories;
|
||||
|
||||
if (auto dep = meta_.dir_entries()) {
|
||||
auto dirent = *dep;
|
||||
if (auto opts = meta_.options(); opts and opts->packed_directories()) {
|
||||
auto dirent = *meta_.dir_entries();
|
||||
auto metadir = meta_.directories();
|
||||
|
||||
{
|
||||
@ -503,8 +517,9 @@ class metadata_ final : public metadata_v2::impl {
|
||||
std::vector<uint32_t> decompress_shared_files() const {
|
||||
std::vector<uint32_t> decompressed;
|
||||
|
||||
if (auto sfp = meta_.shared_files_table()) {
|
||||
if (!sfp->empty()) {
|
||||
if (auto opts = meta_.options();
|
||||
opts and opts->packed_shared_files_table()) {
|
||||
if (auto sfp = meta_.shared_files_table(); sfp and !sfp->empty()) {
|
||||
auto ti = LOG_TIMED_DEBUG;
|
||||
|
||||
auto size = std::accumulate(sfp->begin(), sfp->end(), 2 * sfp->size());
|
||||
@ -656,6 +671,22 @@ void metadata_<LoggerPolicy>::dump(
|
||||
os << "block size: " << stbuf.f_bsize << std::endl;
|
||||
os << "inode count: " << stbuf.f_files << std::endl;
|
||||
os << "original filesystem size: " << stbuf.f_blocks << std::endl;
|
||||
if (auto opt = meta_.options()) {
|
||||
std::vector<std::string> options;
|
||||
auto boolopt = [&](auto const& name, bool value) {
|
||||
if (value) {
|
||||
options.push_back(name);
|
||||
}
|
||||
};
|
||||
boolopt("mtime_only", opt->mtime_only());
|
||||
boolopt("packed_chunk_table", opt->packed_chunk_table());
|
||||
boolopt("packed_directories", opt->packed_directories());
|
||||
boolopt("packed_shared_files_table", opt->packed_shared_files_table());
|
||||
os << "options: " << boost::join(options, "\n ") << std::endl;
|
||||
if (auto res = opt->time_resolution_sec()) {
|
||||
os << "time resolution: " << *res << " seconds" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (detail_level > 1) {
|
||||
@ -681,9 +712,13 @@ void metadata_<LoggerPolicy>::dump(
|
||||
os << "dir_entries: " << de->size() << std::endl;
|
||||
}
|
||||
if (auto sfp = meta_.shared_files_table()) {
|
||||
os << "compressed shared_files_table: " << sfp->size() << std::endl;
|
||||
os << "decompressed shared_files_table: " << shared_files_.size()
|
||||
<< std::endl;
|
||||
if (meta_.options()->packed_shared_files_table()) {
|
||||
os << "compressed shared_files_table: " << sfp->size() << std::endl;
|
||||
os << "decompressed shared_files_table: " << shared_files_.size()
|
||||
<< std::endl;
|
||||
} else {
|
||||
os << "shared_files_table: " << sfp->size() << std::endl;
|
||||
}
|
||||
os << "unique files: " << unique_files_ << std::endl;
|
||||
}
|
||||
|
||||
@ -870,10 +905,7 @@ void metadata_<LoggerPolicy>::walk_data_order_impl(
|
||||
for (size_t ix = 0; ix < first_chunk_block.size(); ++ix) {
|
||||
int ino = (*dep)[ix].inode_num();
|
||||
if (ino >= file_inode_offset_ and ino < dev_inode_offset_) {
|
||||
ino -= file_inode_offset_;
|
||||
if (ino >= unique_files_) {
|
||||
ino = shared_files_[ino - unique_files_] + unique_files_;
|
||||
}
|
||||
ino = file_inode_to_chunk_index(ino);
|
||||
if (auto beg = chunk_table_lookup(ino);
|
||||
beg != chunk_table_lookup(ino + 1)) {
|
||||
first_chunk_block[ix] = meta_.chunks()[beg].block();
|
||||
|
@ -322,8 +322,8 @@ class save_shared_files_visitor : public visitor_base {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<uint32_t>& get_compressed_shared_files() {
|
||||
if (!shared_files_.empty() && !compressed_) {
|
||||
void pack_shared_files() {
|
||||
if (!shared_files_.empty()) {
|
||||
DWARFS_CHECK(std::is_sorted(shared_files_.begin(), shared_files_.end()),
|
||||
"shared files vector not sorted");
|
||||
std::vector<uint32_t> compressed;
|
||||
@ -350,14 +350,13 @@ class save_shared_files_visitor : public visitor_base {
|
||||
|
||||
shared_files_.swap(compressed);
|
||||
}
|
||||
|
||||
return shared_files_;
|
||||
}
|
||||
|
||||
std::vector<uint32_t>& get_shared_files() { return shared_files_; }
|
||||
|
||||
private:
|
||||
uint32_t const begin_shared_;
|
||||
uint32_t const num_unique_;
|
||||
bool compressed_{false};
|
||||
std::vector<uint32_t> shared_files_;
|
||||
};
|
||||
|
||||
@ -705,30 +704,41 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
root->accept(sdv);
|
||||
sdv.pack(mv2, ge_data);
|
||||
|
||||
// pack directories
|
||||
uint32_t last_first_entry = 0;
|
||||
for (auto& d : mv2.directories) {
|
||||
d.parent_entry = 0; // this will be recovered
|
||||
auto delta = d.first_entry - last_first_entry;
|
||||
last_first_entry = d.first_entry;
|
||||
d.first_entry = delta;
|
||||
if (options_.pack_directories) {
|
||||
// pack directories
|
||||
uint32_t last_first_entry = 0;
|
||||
|
||||
for (auto& d : mv2.directories) {
|
||||
d.parent_entry = 0; // this will be recovered
|
||||
auto delta = d.first_entry - last_first_entry;
|
||||
last_first_entry = d.first_entry;
|
||||
d.first_entry = delta;
|
||||
}
|
||||
}
|
||||
|
||||
// delta-compress chunk table
|
||||
std::adjacent_difference(mv2.chunk_table.begin(), mv2.chunk_table.end(),
|
||||
mv2.chunk_table.begin());
|
||||
if (options_.pack_chunk_table) {
|
||||
// delta-compress chunk table
|
||||
std::adjacent_difference(mv2.chunk_table.begin(), mv2.chunk_table.end(),
|
||||
mv2.chunk_table.begin());
|
||||
}
|
||||
|
||||
LOG_INFO << "saving shared files table...";
|
||||
save_shared_files_visitor ssfv(first_file_inode, first_device_inode,
|
||||
fdv.num_unique());
|
||||
root->accept(ssfv);
|
||||
mv2.shared_files_table_ref() = std::move(ssfv.get_compressed_shared_files());
|
||||
if (options_.pack_shared_files_table) {
|
||||
ssfv.pack_shared_files();
|
||||
}
|
||||
mv2.shared_files_table_ref() = std::move(ssfv.get_shared_files());
|
||||
|
||||
thrift::metadata::fs_options fsopts;
|
||||
fsopts.mtime_only = !options_.keep_all_times;
|
||||
if (options_.time_resolution_sec > 1) {
|
||||
fsopts.time_resolution_sec_ref() = options_.time_resolution_sec;
|
||||
}
|
||||
fsopts.packed_chunk_table = options_.pack_chunk_table;
|
||||
fsopts.packed_directories = options_.pack_directories;
|
||||
fsopts.packed_shared_files_table = options_.pack_shared_files_table;
|
||||
|
||||
mv2.uids = ge_data.get_uids();
|
||||
mv2.gids = ge_data.get_gids();
|
||||
|
@ -331,7 +331,7 @@ int mkdwarfs(int argc, char** argv) {
|
||||
block_manager::config cfg;
|
||||
std::string path, output, memory_limit, script_arg, compression,
|
||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||
time_resolution, order, progress_mode, recompress_opts;
|
||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata;
|
||||
size_t num_workers, max_scanner_workers;
|
||||
bool no_progress = false;
|
||||
unsigned level;
|
||||
@ -395,6 +395,9 @@ int mkdwarfs(int argc, char** argv) {
|
||||
("metadata-compression",
|
||||
po::value<std::string>(&metadata_compression),
|
||||
"metadata compression algorithm")
|
||||
("pack-metadata",
|
||||
po::value<std::string>(&pack_metadata)->default_value("all"),
|
||||
"pack certain metadata elements (none, chunk_table, directories, shared_files, all)")
|
||||
("recompress",
|
||||
po::value<std::string>(&recompress_opts)->implicit_value("all"),
|
||||
"recompress an existing filesystem (none, block, metadata, all)")
|
||||
@ -725,6 +728,30 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!pack_metadata.empty() and pack_metadata != "none") {
|
||||
if (pack_metadata == "all") {
|
||||
options.pack_chunk_table = true;
|
||||
options.pack_directories = true;
|
||||
options.pack_shared_files_table = true;
|
||||
} else {
|
||||
std::vector<std::string> pack_opts;
|
||||
boost::split(pack_opts, pack_metadata, boost::is_any_of(","));
|
||||
for (auto const& opt : pack_opts) {
|
||||
if (opt == "chunk_table") {
|
||||
options.pack_chunk_table = true;
|
||||
} else if (opt == "directories") {
|
||||
options.pack_directories = true;
|
||||
} else if (opt == "shared_files") {
|
||||
options.pack_shared_files_table = true;
|
||||
} else {
|
||||
std::cerr << "error: the argument ('" << opt
|
||||
<< "') to '--pack-metadata' is invalid" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned interval_ms =
|
||||
pg_mode == console_writer::NONE || pg_mode == console_writer::SIMPLE
|
||||
? 2000
|
||||
|
@ -194,7 +194,9 @@ void basic_end_to_end_test(std::string const& compressor,
|
||||
unsigned block_size_bits, file_order_mode file_order,
|
||||
bool with_devices, bool with_specials, bool set_uid,
|
||||
bool set_gid, bool set_time, bool keep_all_times,
|
||||
bool enable_nlink) {
|
||||
bool enable_nlink, bool pack_chunk_table,
|
||||
bool pack_directories,
|
||||
bool pack_shared_files_table) {
|
||||
block_manager::config cfg;
|
||||
scanner_options options;
|
||||
|
||||
@ -207,6 +209,9 @@ void basic_end_to_end_test(std::string const& compressor,
|
||||
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
||||
options.inode.with_nilsimsa = file_order == file_order_mode::NILSIMSA;
|
||||
options.keep_all_times = keep_all_times;
|
||||
options.pack_chunk_table = pack_chunk_table;
|
||||
options.pack_directories = pack_directories;
|
||||
options.pack_shared_files_table = pack_shared_files_table;
|
||||
|
||||
if (set_uid) {
|
||||
options.uid = 0;
|
||||
@ -513,8 +518,9 @@ class compression_test
|
||||
: public testing::TestWithParam<
|
||||
std::tuple<std::string, unsigned, file_order_mode>> {};
|
||||
|
||||
class scanner_test : public testing::TestWithParam<
|
||||
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
|
||||
class scanner_test
|
||||
: public testing::TestWithParam<std::tuple<bool, bool, bool, bool, bool,
|
||||
bool, bool, bool, bool, bool>> {
|
||||
};
|
||||
|
||||
TEST_P(compression_test, end_to_end) {
|
||||
@ -526,16 +532,18 @@ TEST_P(compression_test, end_to_end) {
|
||||
}
|
||||
|
||||
basic_end_to_end_test(compressor, block_size_bits, file_order, true, true,
|
||||
false, false, false, false, false);
|
||||
false, false, false, false, false, true, true, true);
|
||||
}
|
||||
|
||||
TEST_P(scanner_test, end_to_end) {
|
||||
auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times,
|
||||
enable_nlink] = GetParam();
|
||||
enable_nlink, pack_chunk_table, pack_directories,
|
||||
pack_shared_files_table] = GetParam();
|
||||
|
||||
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE,
|
||||
with_devices, with_specials, set_uid, set_gid, set_time,
|
||||
keep_all_times, enable_nlink);
|
||||
keep_all_times, enable_nlink, pack_chunk_table,
|
||||
pack_directories, pack_shared_files_table);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
@ -549,5 +557,6 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dwarfs, scanner_test,
|
||||
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||
::testing::Bool()));
|
||||
|
@ -136,6 +136,10 @@ struct fs_options {
|
||||
// time base and offsets are stored with this resolution
|
||||
// 1 = seconds, 60 = minutes, 3600 = hours, ...
|
||||
2: optional UInt32 time_resolution_sec,
|
||||
|
||||
3: required bool packed_chunk_table,
|
||||
4: required bool packed_directories,
|
||||
5: required bool packed_shared_files_table,
|
||||
}
|
||||
|
||||
/**
|
||||
@ -168,8 +172,8 @@ struct metadata {
|
||||
* same for all directories.
|
||||
*
|
||||
* Note that this list is stored in a packed format as of v2.3
|
||||
* and needs to be unpacked before use. See the documentation
|
||||
* for the `directory` struct.
|
||||
* if `options.packed_directories` is `true` and must be unpacked
|
||||
* before use. See the documentation for the `directory` struct.
|
||||
*/
|
||||
2: required list<directory> directories,
|
||||
|
||||
@ -201,8 +205,9 @@ struct metadata {
|
||||
* There's one extra sentinel item at the end that points to the
|
||||
* end of `chunks`, so chunk lookups work the same for all inodes.
|
||||
*
|
||||
* Note that this is stored delta-compressed as of v2.3 and must
|
||||
* be unpacked before using.
|
||||
* Note that this list is stored delta-compressed as of v2.3
|
||||
* if `options.packed_chunk_table` is `true` and must be unpacked
|
||||
* before use.
|
||||
*/
|
||||
4: required list<UInt32> chunk_table,
|
||||
|
||||
@ -286,9 +291,13 @@ struct metadata {
|
||||
/**
|
||||
* Shared files mapping
|
||||
*
|
||||
* Note that this table cannot be used directly and must first
|
||||
* be unpacked. It is stored as number of repetitions per index,
|
||||
* offset by 2 (the minimum number of repetitions), so e.g.
|
||||
* Note that this list is stored in a packed format if
|
||||
* `options.packed_shared_files_table` is `true` and must be
|
||||
* unpacked before use.
|
||||
*
|
||||
* In packed format, it is stored as number of repetitions
|
||||
* per index, offset by 2 (the minimum number of repetitions),
|
||||
* so e.g. a packed list
|
||||
*
|
||||
* [0, 3, 1, 0, 1]
|
||||
*
|
||||
|
Loading…
x
Reference in New Issue
Block a user