refactor: factor out metadata_builder from scanner

This commit is contained in:
Marcus Holland-Moritz 2025-04-06 23:14:45 +02:00
parent 8455dc0229
commit dad4ea9fb7
11 changed files with 654 additions and 272 deletions

View File

@ -140,6 +140,7 @@ add_library(
src/writer/internal/inode_element_view.cpp
src/writer/internal/inode_manager.cpp
src/writer/internal/inode_ordering.cpp
src/writer/internal/metadata_builder.cpp
src/writer/internal/metadata_freezer.cpp
src/writer/internal/nilsimsa.cpp
src/writer/internal/progress.cpp

View File

@ -34,7 +34,7 @@
namespace dwarfs::writer {
struct scanner_options;
struct metadata_options;
namespace internal {
@ -46,8 +46,8 @@ class global_entry_data {
enum class timestamp_type { ATIME, MTIME, CTIME };
global_entry_data(scanner_options const& options)
: options_(options) {}
global_entry_data(metadata_options const& options)
: options_{options} {}
void add_uid(uid_type uid);
void add_gid(gid_type gid);
@ -108,7 +108,7 @@ class global_entry_data {
gid_type next_gid_index_{0};
mode_type next_mode_index_{0};
uint64_t timestamp_base_{std::numeric_limits<uint64_t>::max()};
scanner_options const& options_;
metadata_options const& options_;
};
} // namespace internal

View File

@ -0,0 +1,141 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <memory>
#include <span>
#include <string>
#include <utility>
#include <vector>
namespace dwarfs {
class logger;
namespace writer {
struct metadata_options;
}
namespace thrift::metadata {
class metadata;
} // namespace thrift::metadata
namespace writer::internal {
class global_entry_data;
class inode_manager;
class block_manager;
class dir;
/**
 * Front end for assembling a `thrift::metadata::metadata` object.
 *
 * The class itself holds no logic: every public member function forwards
 * to the polymorphic `impl` object owned through `impl_`. The constructors
 * and the destructor are defined out of line.
 */
class metadata_builder {
 public:
  /**
   * Interface that the concrete builder has to provide. It declares one
   * pure virtual function for each forwarding member of `metadata_builder`.
   */
  class impl {
   public:
    virtual ~impl() = default;

    virtual void set_devices(std::vector<uint64_t> devices) = 0;
    virtual void set_symlink_table_size(size_t size) = 0;
    virtual void set_block_size(uint32_t block_size) = 0;
    virtual void set_total_fs_size(uint64_t total_fs_size) = 0;
    virtual void set_total_hardlink_size(uint64_t total_hardlink_size) = 0;
    virtual void
    set_shared_files_table(std::vector<uint32_t> shared_files) = 0;
    virtual void
    set_category_names(std::vector<std::string> category_names) = 0;
    virtual void
    set_block_categories(std::vector<uint32_t> block_categories) = 0;
    virtual void add_symlink_table_entry(size_t index, uint32_t entry) = 0;
    virtual void gather_chunks(inode_manager const& im,
                               block_manager const& bm,
                               size_t chunk_count) = 0;
    virtual void
    gather_entries(std::span<dir*> dirs, global_entry_data const& ge_data,
                   uint32_t num_inodes) = 0;
    virtual void
    gather_global_entry_data(global_entry_data const& ge_data) = 0;
    virtual thrift::metadata::metadata const& build() = 0;
  };

  // Three ways to create a builder: from scratch, from a copy of existing
  // metadata, or by taking over existing metadata.
  metadata_builder(logger& lgr, metadata_options const& options);
  metadata_builder(logger& lgr, thrift::metadata::metadata const& md,
                   metadata_options const& options);
  metadata_builder(logger& lgr, thrift::metadata::metadata&& md,
                   metadata_options const& options);

  ~metadata_builder();

  // --- plain setters; by-value arguments are moved into the impl ---

  void set_devices(std::vector<uint64_t> devices) {
    impl_->set_devices(std::move(devices));
  }

  void set_symlink_table_size(size_t size) {
    impl_->set_symlink_table_size(size);
  }

  void set_block_size(uint32_t block_size) {
    impl_->set_block_size(block_size);
  }

  void set_total_fs_size(uint64_t total_fs_size) {
    impl_->set_total_fs_size(total_fs_size);
  }

  void set_total_hardlink_size(uint64_t total_hardlink_size) {
    impl_->set_total_hardlink_size(total_hardlink_size);
  }

  void set_shared_files_table(std::vector<uint32_t> shared_files) {
    impl_->set_shared_files_table(std::move(shared_files));
  }

  void set_category_names(std::vector<std::string> category_names) {
    impl_->set_category_names(std::move(category_names));
  }

  void set_block_categories(std::vector<uint32_t> block_categories) {
    impl_->set_block_categories(std::move(block_categories));
  }

  void add_symlink_table_entry(size_t index, uint32_t entry) {
    impl_->add_symlink_table_entry(index, entry);
  }

  // --- bulk collection from the writer's data structures ---

  void gather_chunks(inode_manager const& im, block_manager const& bm,
                     size_t chunk_count) {
    impl_->gather_chunks(im, bm, chunk_count);
  }

  void gather_entries(std::span<dir*> dirs, global_entry_data const& ge_data,
                      uint32_t num_inodes) {
    impl_->gather_entries(dirs, ge_data, num_inodes);
  }

  void gather_global_entry_data(global_entry_data const& ge_data) {
    impl_->gather_global_entry_data(ge_data);
  }

  // Returns a reference to the metadata object held by the implementation.
  thrift::metadata::metadata const& build() { return impl_->build(); }

 private:
  std::unique_ptr<impl> impl_;
};
} // namespace writer::internal
} // namespace dwarfs

View File

@ -0,0 +1,59 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <functional>
#include <optional>
#include <string>
#include <vector>
#include <dwarfs/file_stat.h>
#include <dwarfs/history_config.h>
#include <dwarfs/writer/inode_options.h>
namespace dwarfs::writer {
class entry_interface;
// Options that control how file system metadata is assembled. The struct is
// embedded in scanner_options as `metadata` and is consumed by
// metadata_builder and global_entry_data.
struct metadata_options {
// If set, metadata_builder stores a uid table containing only this value
// instead of the uids collected in global_entry_data.
std::optional<file_stat::uid_type> uid{};
// If set, metadata_builder stores a gid table containing only this value
// instead of the gids collected in global_entry_data.
std::optional<file_stat::gid_type> gid{};
// mkdwarfs --set-time: "set timestamp for whole file system".
// NOTE(review): presumably applied by global_entry_data; the consumer is
// not part of this header.
std::optional<uint64_t> timestamp{};
// mkdwarfs --keep-all-times: "save atime and ctime in addition to mtime".
// metadata_builder stores the negation as the `mtime_only` fs option.
bool keep_all_times{false};
// Only recorded in the fs options when greater than 1.
uint32_t time_resolution_sec{1};
// Store the chunk table delta-compressed (std::adjacent_difference).
bool pack_chunk_table{false};
// Store directories packed: parent/self entries are zeroed and first_entry
// is delta-encoded.
bool pack_directories{false};
// Store the shared files table as per-group counts (count - 2).
bool pack_shared_files_table{false};
// When false, names are converted with string_table::pack into the compact
// names table and the plain names list is reset.
bool plain_names_table{false};
// Forwarded to string_table::pack_options for the names table.
bool pack_names{false};
// Forwarded to string_table::pack_options for the names table.
bool pack_names_index{false};
// When false, symlinks are converted with string_table::pack into the
// compact symlinks table and the plain symlinks list is reset.
bool plain_symlinks_table{false};
// Forwarded to string_table::pack_options for the symlinks table.
bool pack_symlinks{false};
// Forwarded to string_table::pack_options for the symlinks table.
bool pack_symlinks_index{false};
// Forwarded to string_table::pack_options for both string tables.
bool force_pack_string_tables{false};
// mkdwarfs --no-create-timestamp: "don't add create timestamp to file
// system". When false, metadata_builder stores std::time(nullptr).
bool no_create_timestamp{false};
// Minimum number of chunks an inode must have to get an entry in the
// inode size cache; also stored as the cache's min_chunk_count.
size_t inode_size_cache_min_chunk_count{128};
};
} // namespace dwarfs::writer

View File

@ -33,6 +33,7 @@
#include <dwarfs/file_stat.h>
#include <dwarfs/history_config.h>
#include <dwarfs/writer/inode_options.h>
#include <dwarfs/writer/metadata_options.h>
namespace dwarfs::writer {
@ -40,33 +41,17 @@ class entry_interface;
struct scanner_options {
std::optional<std::string> file_hash_algorithm{"xxh3-128"};
std::optional<file_stat::uid_type> uid;
std::optional<file_stat::gid_type> gid;
std::optional<uint64_t> timestamp;
bool keep_all_times{false};
bool remove_empty_dirs{false};
bool with_devices{false};
bool with_specials{false};
uint32_t time_resolution_sec{1};
inode_options inode;
bool pack_chunk_table{false};
bool pack_directories{false};
bool pack_shared_files_table{false};
bool plain_names_table{false};
bool pack_names{false};
bool pack_names_index{false};
bool plain_symlinks_table{false};
bool pack_symlinks{false};
bool pack_symlinks_index{false};
bool force_pack_string_tables{false};
bool no_create_timestamp{false};
std::optional<std::function<void(bool, writer::entry_interface const&)>>
debug_filter_function;
size_t num_segmenter_workers{1};
bool enable_history{true};
std::optional<std::vector<std::string>> command_line_arguments;
history_config history;
size_t inode_size_cache_min_chunk_count{128};
metadata_options metadata;
};
} // namespace dwarfs::writer

View File

@ -57,13 +57,11 @@ std::vector<T> global_entry_data::get_vector(map_type<T, U> const& map) const {
}
auto global_entry_data::get_uids() const -> std::vector<uid_type> {
return options_.uid ? std::vector<uid_type>{*options_.uid}
: get_vector(uids_);
return get_vector(uids_);
}
auto global_entry_data::get_gids() const -> std::vector<gid_type> {
return options_.gid ? std::vector<gid_type>{*options_.gid}
: get_vector(gids_);
return get_vector(gids_);
}
auto global_entry_data::get_modes() const -> std::vector<mode_type> {

View File

@ -0,0 +1,351 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <ctime>
#include <filesystem>
#include <dwarfs/logger.h>
#include <dwarfs/version.h>
#include <dwarfs/writer/metadata_options.h>
#include <dwarfs/internal/features.h>
#include <dwarfs/internal/string_table.h>
#include <dwarfs/writer/internal/block_manager.h>
#include <dwarfs/writer/internal/entry.h>
#include <dwarfs/writer/internal/global_entry_data.h>
#include <dwarfs/writer/internal/inode_manager.h>
#include <dwarfs/writer/internal/metadata_builder.h>
#include <dwarfs/gen-cpp2/metadata_types.h>
namespace dwarfs::writer::internal {
namespace {
using namespace dwarfs::internal;
/**
 * Concrete `metadata_builder::impl`, parameterised on the logger policy.
 *
 * All state is accumulated in `md_`; `build()` finalises it and hands out
 * a reference to it. `options_` is held by reference, so the options
 * object passed to the constructor has to outlive this builder.
 */
template <typename LoggerPolicy>
class metadata_builder_ final : public metadata_builder::impl {
 public:
  // Starts from a default-constructed metadata object.
  metadata_builder_(logger& lgr, metadata_options const& options)
      : LOG_PROXY_INIT(lgr)
      , options_{options} {}

  // Starts from a copy of `md`.
  metadata_builder_(logger& lgr, thrift::metadata::metadata const& md,
                    metadata_options const& options)
      : LOG_PROXY_INIT(lgr)
      , md_{md}
      , options_{options} {}

  // Starts from `md`, taking over its contents.
  metadata_builder_(logger& lgr, thrift::metadata::metadata&& md,
                    metadata_options const& options)
      : LOG_PROXY_INIT(lgr)
      , md_{std::move(md)}
      , options_{options} {}

  // --- defined out of line below ---

  void gather_chunks(inode_manager const& im, block_manager const& bm,
                     size_t chunk_count) override;

  void gather_entries(std::span<dir*> dirs, global_entry_data const& ge_data,
                      uint32_t num_inodes) override;

  void gather_global_entry_data(global_entry_data const& ge_data) override;

  thrift::metadata::metadata const& build() override;

  // --- scalar fields ---

  void set_block_size(uint32_t block_size) override {
    md_.block_size() = block_size;
  }

  void set_total_fs_size(uint64_t total_fs_size) override {
    md_.total_fs_size() = total_fs_size;
  }

  void set_total_hardlink_size(uint64_t total_hardlink_size) override {
    md_.total_hardlink_size() = total_hardlink_size;
  }

  // --- tables that are handed over wholesale ---

  void set_devices(std::vector<uint64_t> devices) override {
    md_.devices() = std::move(devices);
  }

  void set_shared_files_table(std::vector<uint32_t> shared_files) override {
    md_.shared_files_table() = std::move(shared_files);
  }

  void set_category_names(std::vector<std::string> category_names) override {
    md_.category_names() = std::move(category_names);
  }

  void set_block_categories(std::vector<uint32_t> block_categories) override {
    md_.block_categories() = std::move(block_categories);
  }

  // --- symlink table: sized once, then filled entry by entry ---

  void set_symlink_table_size(size_t size) override {
    md_.symlink_table()->resize(size);
  }

  void add_symlink_table_entry(size_t index, uint32_t entry) override {
    DWARFS_NOTHROW(md_.symlink_table()->at(index)) = entry;
  }

 private:
  thrift::metadata::inode_size_cache build_inode_size_cache() const;

  LOG_PROXY_DECL(LoggerPolicy);
  thrift::metadata::metadata md_;
  feature_set features_;
  metadata_options const& options_;
};
/**
 * Builds the inode size cache from the chunk table and chunk list in `md_`.
 *
 * An inode gets a cache entry (inode index -> sum of its chunk sizes) when
 * it has at least `options_.inode_size_cache_min_chunk_count` chunks. The
 * chunk table is read as absolute offsets into the chunk list, with entry
 * `ino + 1` marking the end of inode `ino`; `build()` therefore calls this
 * before it delta-compresses the chunk table.
 *
 * \return the populated cache, with `min_chunk_count` set from the options.
 */
template <typename LoggerPolicy>
thrift::metadata::inode_size_cache
metadata_builder_<LoggerPolicy>::build_inode_size_cache() const {
  auto tv = LOG_TIMED_VERBOSE;

  thrift::metadata::inode_size_cache cache;
  cache.min_chunk_count() = options_.inode_size_cache_min_chunk_count;

  auto const& chunk_table = md_.chunk_table().value();
  auto const& chunks = md_.chunks().value();

  // The bound is written as `ino + 1 < size()` rather than
  // `ino < size() - 1`: with an empty chunk table (gather_chunks() never
  // ran) the unsigned subtraction would wrap around and the loop would
  // read far out of bounds.
  //
  // NOTE(review): iteration starts at 1, so inode 0 never gets a cache
  // entry -- confirm that this is intentional.
  for (size_t ino = 1; ino + 1 < chunk_table.size(); ++ino) {
    auto const begin = chunk_table[ino];
    auto const end = chunk_table[ino + 1];
    auto const num_chunks = end - begin;

    if (num_chunks >= options_.inode_size_cache_min_chunk_count) {
      uint64_t size = 0;

      for (uint32_t ix = begin; ix < end; ++ix) {
        auto const& chunk = chunks[ix];
        size += chunk.size().value();
      }

      LOG_DEBUG << "caching size " << size << " for inode " << ino << " with "
                << num_chunks << " chunks";

      cache.lookup()->emplace(ino, size);
    }
  }

  tv << "building inode size cache...";

  return cache;
}
/**
 * Fills the chunk table and the chunk list from the inode manager.
 *
 * For every inode (visited in the inode manager's order) the current
 * length of the chunk list is stored in the chunk table at the inode's
 * number, then the inode's chunks are appended. Afterwards the block
 * manager maps the logical blocks of all chunks, and one extra chunk table
 * entry holding the total chunk count is written at index `im.count()`.
 *
 * \param im           source of the inodes and their chunks
 * \param bm           block manager used to map logical blocks
 * \param chunk_count  capacity to reserve in the chunk list up front
 */
template <typename LoggerPolicy>
void metadata_builder_<LoggerPolicy>::gather_chunks(inode_manager const& im,
                                                    block_manager const& bm,
                                                    size_t chunk_count) {
  md_.chunk_table()->resize(im.count() + 1);

  auto& all_chunks = md_.chunks().value();
  all_chunks.reserve(chunk_count);

  im.for_each_inode_in_order([&](std::shared_ptr<inode> const& ino) {
    auto const total_chunks = all_chunks.size();
    DWARFS_NOTHROW(md_.chunk_table()->at(ino->num())) = total_chunks;

    if (ino->append_chunks_to(all_chunks)) {
      return;
    }

    // Appending failed: report every file that belongs to this inode.
    std::ostringstream affected_files;
    for (auto fp : ino->all()) {
      affected_files << "\n " << fp->path_as_string();
    }
    LOG_ERROR << "inconsistent fragments in inode " << ino->num()
              << ", the following files will be empty:"
              << affected_files.str();
  });

  bm.map_logical_blocks(all_chunks);

  // insert dummy inode to help determine number of chunks per inode
  DWARFS_NOTHROW(md_.chunk_table()->at(im.count())) = all_chunks.size();

  LOG_DEBUG << "total number of unique files: " << im.count();
  LOG_DEBUG << "total number of chunks: " << all_chunks.size();
}
/**
 * Packs all directories and their entries into `md_`.
 *
 * The directory entry list is reset, the inode table is sized to
 * `num_inodes`, and every directory in `dirs` is packed in order. A
 * directory without a parent additionally gets its own entry packed first.
 * A trailing dummy directory whose `first_entry` equals the final number
 * of directory entries is appended at the end.
 *
 * \param dirs        directories to pack, in the order they are stored
 * \param ge_data     global entry data passed through to the pack calls
 * \param num_inodes  size of the inode table
 */
template <typename LoggerPolicy>
void metadata_builder_<LoggerPolicy>::gather_entries(
    std::span<dir*> dirs, global_entry_data const& ge_data,
    uint32_t num_inodes) {
  md_.dir_entries() = std::vector<thrift::metadata::dir_entry>();
  md_.inodes()->resize(num_inodes);
  md_.directories()->reserve(dirs.size() + 1);

  for (auto* current : dirs) {
    bool const parentless = !current->has_parent();

    if (parentless) {
      // Nobody else will pack an entry for this directory, so do it here.
      current->set_entry_index(md_.dir_entries()->size());
      current->pack_entry(md_, ge_data);
    }

    current->pack(md_, ge_data);
  }

  // Trailing dummy directory: marks the end of the last real directory.
  thrift::metadata::directory sentinel;
  sentinel.parent_entry() = 0;
  sentinel.first_entry() = md_.dir_entries()->size();
  sentinel.self_entry() = 0;
  md_.directories()->push_back(sentinel);
}
/**
 * Copies the tables collected in `ge_data` into `md_`.
 *
 * Names, symlinks, modes and the timestamp base are taken over unchanged.
 * For uids and gids, an override configured in the options wins: the
 * stored table then consists of just that single value.
 */
template <typename LoggerPolicy>
void metadata_builder_<LoggerPolicy>::gather_global_entry_data(
    global_entry_data const& ge_data) {
  md_.names() = ge_data.get_names();
  md_.symlinks() = ge_data.get_symlinks();

  if (options_.uid) {
    md_.uids() = std::vector<file_stat::uid_type>{*options_.uid};
  } else {
    md_.uids() = ge_data.get_uids();
  }

  if (options_.gid) {
    md_.gids() = std::vector<file_stat::gid_type>{*options_.gid};
  } else {
    md_.gids() = ge_data.get_gids();
  }

  md_.modes() = ge_data.get_modes();
  md_.timestamp_base() = ge_data.get_timestamp_base();
}
/**
 * Finalises `md_` according to the options and returns a reference to it.
 *
 * Steps, in order:
 *  - derive the fs options from `options_`
 *  - optionally pack the directories (delta-encode `first_entry`)
 *  - build the inode size cache (needs the unpacked chunk table)
 *  - optionally delta-compress the chunk table
 *  - optionally pack the shared files table into per-group counts
 *  - unless plain tables are requested, convert names/symlinks to compact
 *    string tables and reset the plain lists
 *  - store options, features, version, create timestamp, path separator
 *
 * NOTE(review): the packing steps modify `md_` in place, so a second call
 * would operate on already packed tables -- presumably build() is meant to
 * be called exactly once per builder.
 *
 * \return reference to the finalised metadata owned by this builder
 */
template <typename LoggerPolicy>
thrift::metadata::metadata const& metadata_builder_<LoggerPolicy>::build() {
  LOG_VERBOSE << "building metadata";

  thrift::metadata::fs_options fsopts;
  fsopts.mtime_only() = !options_.keep_all_times;
  if (options_.time_resolution_sec > 1) {
    fsopts.time_resolution_sec() = options_.time_resolution_sec;
  }
  fsopts.packed_chunk_table() = options_.pack_chunk_table;
  fsopts.packed_directories() = options_.pack_directories;
  fsopts.packed_shared_files_table() = options_.pack_shared_files_table;

  if (options_.pack_directories) {
    // pack directories
    uint32_t last_first_entry = 0;

    for (auto& d : md_.directories().value()) {
      d.parent_entry() = 0; // this will be recovered
      d.self_entry() = 0;   // this will be recovered
      auto delta = d.first_entry().value() - last_first_entry;
      last_first_entry = d.first_entry().value();
      d.first_entry() = delta;
    }
  }

  // Must happen before the chunk table is delta-compressed below, because
  // the cache builder reads absolute chunk offsets.
  md_.reg_file_size_cache() = build_inode_size_cache();

  if (options_.pack_chunk_table) {
    // delta-compress chunk table
    std::adjacent_difference(md_.chunk_table()->begin(),
                             md_.chunk_table()->end(),
                             md_.chunk_table()->begin());
  }

  if (options_.pack_shared_files_table) {
    if (!md_.shared_files_table()->empty()) {
      auto& sf = md_.shared_files_table().value();

      DWARFS_CHECK(std::ranges::is_sorted(sf),
                   "shared files vector not sorted");

      // Run-length encode: one entry per group index, holding the group's
      // size minus 2 (every group is expected to have at least 2 members).
      std::vector<uint32_t> compressed;
      compressed.reserve(sf.back() + 1);

      uint32_t count = 0;
      uint32_t index = 0;

      for (auto i : sf) {
        if (i == index) {
          ++count;
        } else {
          ++index;
          DWARFS_CHECK(i == index, "inconsistent shared files vector");
          DWARFS_CHECK(count >= 2, "unique file in shared files vector");
          compressed.emplace_back(count - 2);
          count = 1;
        }
      }

      // The loop only validates a group when the next one starts, so the
      // final group has to be checked here; otherwise `count - 2` would
      // silently wrap around for a single-member last group.
      DWARFS_CHECK(count >= 2, "unique file in shared files vector");
      compressed.emplace_back(count - 2);

      DWARFS_CHECK(compressed.size() == sf.back() + 1,
                   "unexpected compressed vector size");

      sf.swap(compressed);
    }
  }

  if (!options_.plain_names_table) {
    auto ti = LOG_TIMED_INFO;
    md_.compact_names() = string_table::pack(
        md_.names().value(), string_table::pack_options(
                                 options_.pack_names, options_.pack_names_index,
                                 options_.force_pack_string_tables));
    // Reset the plain names list to the state of a default-constructed
    // metadata object now that the compact table has been built.
    thrift::metadata::metadata tmp;
    md_.names().copy_from(tmp.names());
    ti << "saving names table...";
  }

  if (!options_.plain_symlinks_table) {
    auto ti = LOG_TIMED_INFO;
    md_.compact_symlinks() = string_table::pack(
        md_.symlinks().value(),
        string_table::pack_options(options_.pack_symlinks,
                                   options_.pack_symlinks_index,
                                   options_.force_pack_string_tables));
    // Same reset as for the names list above.
    thrift::metadata::metadata tmp;
    md_.symlinks().copy_from(tmp.symlinks());
    ti << "saving symlinks table...";
  }

  md_.options() = fsopts;
  md_.features() = features_.get();
  md_.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID;
  if (!options_.no_create_timestamp) {
    md_.create_timestamp() = std::time(nullptr);
  }
  md_.preferred_path_separator() =
      static_cast<uint32_t>(std::filesystem::path::preferred_separator);

  return md_;
}
} // namespace
// Creates a builder that starts from empty metadata. The concrete
// implementation is created by make_unique_logging_object() from `lgr`
// and the `logger_policies` list.
metadata_builder::metadata_builder(logger& lgr,
                                   metadata_options const& options)
    : impl_(
          make_unique_logging_object<impl, metadata_builder_, logger_policies>(
              lgr, options)) {}

// Creates a builder that starts from a copy of `md`.
metadata_builder::metadata_builder(logger& lgr,
                                   thrift::metadata::metadata const& md,
                                   metadata_options const& options)
    : impl_(
          make_unique_logging_object<impl, metadata_builder_, logger_policies>(
              lgr, md, options)) {}

// Creates a builder that takes over the contents of `md`.
metadata_builder::metadata_builder(logger& lgr,
                                   thrift::metadata::metadata&& md,
                                   metadata_options const& options)
    : impl_(
          make_unique_logging_object<impl, metadata_builder_, logger_policies>(
              lgr, std::move(md), options)) {}

// Defined out of line, next to the constructors.
metadata_builder::~metadata_builder() = default;
} // namespace dwarfs::writer::internal

View File

@ -62,8 +62,6 @@
#include <dwarfs/writer/segmenter_factory.h>
#include <dwarfs/writer/writer_progress.h>
#include <dwarfs/internal/features.h>
#include <dwarfs/internal/string_table.h>
#include <dwarfs/internal/worker_group.h>
#include <dwarfs/writer/internal/block_manager.h>
#include <dwarfs/writer/internal/entry.h>
@ -73,11 +71,10 @@
#include <dwarfs/writer/internal/global_entry_data.h>
#include <dwarfs/writer/internal/inode.h>
#include <dwarfs/writer/internal/inode_manager.h>
#include <dwarfs/writer/internal/metadata_builder.h>
#include <dwarfs/writer/internal/metadata_freezer.h>
#include <dwarfs/writer/internal/progress.h>
#include <dwarfs/gen-cpp2/metadata_types.h>
namespace dwarfs::writer {
namespace internal {
@ -190,24 +187,7 @@ class save_directories_visitor : public visitor_base {
void visit(dir* p) override { directories_.at(p->inode_num().value()) = p; }
void pack(thrift::metadata::metadata& mv2, global_entry_data& ge_data) {
for (auto p : directories_) {
if (!p->has_parent()) {
p->set_entry_index(mv2.dir_entries()->size());
p->pack_entry(mv2, ge_data);
}
p->pack(mv2, ge_data);
}
thrift::metadata::directory dummy;
dummy.parent_entry() = 0;
dummy.first_entry() = mv2.dir_entries()->size();
dummy.self_entry() = 0;
mv2.directories()->push_back(dummy);
directories_.clear();
}
std::span<dir*> get_directories() { return directories_; }
private:
std::vector<dir*> directories_;
@ -232,36 +212,6 @@ class save_shared_files_visitor : public visitor_base {
}
}
void pack_shared_files() {
if (!shared_files_.empty()) {
DWARFS_CHECK(std::ranges::is_sorted(shared_files_),
"shared files vector not sorted");
std::vector<uint32_t> compressed;
compressed.reserve(shared_files_.back() + 1);
uint32_t count = 0;
uint32_t index = 0;
for (auto i : shared_files_) {
if (i == index) {
++count;
} else {
++index;
DWARFS_CHECK(i == index, "inconsistent shared files vector");
DWARFS_CHECK(count >= 2, "unique file in shared files vector");
compressed.emplace_back(count - 2);
count = 1;
}
}
compressed.emplace_back(count - 2);
DWARFS_CHECK(compressed.size() == shared_files_.back() + 1,
"unexpected compressed vector size");
shared_files_.swap(compressed);
}
}
std::vector<uint32_t>& get_shared_files() { return shared_files_; }
private:
@ -792,17 +742,14 @@ void scanner_<LoggerPolicy>::scan(
}
}
global_entry_data ge_data(options_);
thrift::metadata::metadata mv2;
feature_set features;
mv2.symlink_table()->resize(first_file_inode - first_link_inode);
global_entry_data ge_data(options_.metadata);
metadata_builder mdb(LOG_GET_LOGGER, options_.metadata);
LOG_INFO << "assigning device inodes...";
uint32_t first_pipe_inode = first_device_inode;
device_set_inode_visitor devsiv(first_pipe_inode);
root->accept(devsiv);
mv2.devices() = std::move(devsiv.device_ids());
mdb.set_devices(std::move(devsiv.device_ids()));
LOG_INFO << "assigning pipe/socket inodes...";
uint32_t last_inode = first_pipe_inode;
@ -811,6 +758,8 @@ void scanner_<LoggerPolicy>::scan(
LOG_INFO << "building metadata...";
mdb.set_symlink_table_size(first_file_inode - first_link_inode);
wg_.add_job([&] {
LOG_INFO << "saving names and symlinks...";
names_and_symlinks_visitor nlv(ge_data);
@ -821,10 +770,10 @@ void scanner_<LoggerPolicy>::scan(
LOG_INFO << "updating name and link indices...";
root->walk([&](entry* ep) {
ep->update(ge_data);
if (auto lp = dynamic_cast<link*>(ep)) {
DWARFS_NOTHROW(mv2.symlink_table()->at(ep->inode_num().value() -
first_link_inode)) =
ge_data.get_symlink_table_entry(lp->linkname());
if (auto* lp = dynamic_cast<link*>(ep)) {
mdb.add_symlink_table_entry(
ep->inode_num().value() - first_link_inode,
ge_data.get_symlink_table_entry(lp->linkname()));
}
});
});
@ -955,124 +904,18 @@ void scanner_<LoggerPolicy>::scan(
prog.run_sync([&] { root->clear_name(); });
LOG_INFO << "saving chunks...";
mv2.chunk_table()->resize(im.count() + 1);
auto& size_cache = mv2.reg_file_size_cache().emplace();
size_cache.min_chunk_count() = options_.inode_size_cache_min_chunk_count;
// TODO: we should be able to start this once all blocks have been
// submitted for compression
mv2.chunks().value().reserve(prog.chunk_count);
im.for_each_inode_in_order([&](std::shared_ptr<inode> const& ino) {
auto const total_chunks = mv2.chunks()->size();
DWARFS_NOTHROW(mv2.chunk_table()->at(ino->num())) = total_chunks;
if (!ino->append_chunks_to(mv2.chunks().value())) {
std::ostringstream oss;
for (auto fp : ino->all()) {
oss << "\n " << fp->path_as_string();
}
LOG_ERROR << "inconsistent fragments in inode " << ino->num()
<< ", the following files will be empty:" << oss.str();
}
auto num_inode_chunks = mv2.chunks()->size() - total_chunks;
if (num_inode_chunks >= options_.inode_size_cache_min_chunk_count) {
LOG_DEBUG << "caching size " << ino->size() << " for inode " << ino->num()
<< " with " << num_inode_chunks << " chunks";
size_cache.lookup()->emplace(ino->num(), ino->size());
}
});
blockmgr->map_logical_blocks(mv2.chunks().value());
// insert dummy inode to help determine number of chunks per inode
DWARFS_NOTHROW(mv2.chunk_table()->at(im.count())) = mv2.chunks()->size();
LOG_DEBUG << "total number of unique files: " << im.count();
LOG_DEBUG << "total number of chunks: " << mv2.chunks()->size();
mdb.gather_chunks(im, *blockmgr, prog.chunk_count);
LOG_INFO << "saving directories...";
mv2.dir_entries() = std::vector<thrift::metadata::dir_entry>();
mv2.inodes()->resize(last_inode);
mv2.directories()->reserve(first_link_inode + 1);
save_directories_visitor sdv(first_link_inode);
root->accept(sdv);
sdv.pack(mv2, ge_data);
if (options_.pack_directories) {
// pack directories
uint32_t last_first_entry = 0;
for (auto& d : mv2.directories().value()) {
d.parent_entry() = 0; // this will be recovered
d.self_entry() = 0; // this will be recovered
auto delta = d.first_entry().value() - last_first_entry;
last_first_entry = d.first_entry().value();
d.first_entry() = delta;
}
}
if (options_.pack_chunk_table) {
// delta-compress chunk table
std::adjacent_difference(mv2.chunk_table()->begin(),
mv2.chunk_table()->end(),
mv2.chunk_table()->begin());
}
mdb.gather_entries(sdv.get_directories(), ge_data, last_inode);
LOG_INFO << "saving shared files table...";
save_shared_files_visitor ssfv(first_file_inode, first_device_inode,
fs.num_unique());
root->accept(ssfv);
if (options_.pack_shared_files_table) {
ssfv.pack_shared_files();
}
mv2.shared_files_table() = std::move(ssfv.get_shared_files());
thrift::metadata::fs_options fsopts;
fsopts.mtime_only() = !options_.keep_all_times;
if (options_.time_resolution_sec > 1) {
fsopts.time_resolution_sec() = options_.time_resolution_sec;
}
fsopts.packed_chunk_table() = options_.pack_chunk_table;
fsopts.packed_directories() = options_.pack_directories;
fsopts.packed_shared_files_table() = options_.pack_shared_files_table;
if (options_.plain_names_table) {
mv2.names() = ge_data.get_names();
} else {
auto ti = LOG_TIMED_INFO;
mv2.compact_names() = string_table::pack(
ge_data.get_names(), string_table::pack_options(
options_.pack_names, options_.pack_names_index,
options_.force_pack_string_tables));
ti << "saving names table...";
}
if (options_.plain_symlinks_table) {
mv2.symlinks() = ge_data.get_symlinks();
} else {
auto ti = LOG_TIMED_INFO;
mv2.compact_symlinks() = string_table::pack(
ge_data.get_symlinks(),
string_table::pack_options(options_.pack_symlinks,
options_.pack_symlinks_index,
options_.force_pack_string_tables));
ti << "saving symlinks table...";
}
mv2.uids() = ge_data.get_uids();
mv2.gids() = ge_data.get_gids();
mv2.modes() = ge_data.get_modes();
mv2.timestamp_base() = ge_data.get_timestamp_base();
mv2.block_size() = segmenter_factory_.get_block_size();
mv2.total_fs_size() = prog.original_size;
mv2.total_hardlink_size() = prog.hardlink_size;
mv2.options() = fsopts;
mv2.dwarfs_version() = std::string("libdwarfs ") + DWARFS_GIT_ID;
if (!options_.no_create_timestamp) {
mv2.create_timestamp() = std::time(nullptr);
}
mv2.preferred_path_separator() =
static_cast<uint32_t>(std::filesystem::path::preferred_separator);
mdb.set_shared_files_table(std::move(ssfv.get_shared_files()));
if (auto catmgr = options_.inode.categorizer_mgr) {
std::unordered_map<fragment_category::value_type,
@ -1097,13 +940,16 @@ void scanner_<LoggerPolicy>::scan(
written_categories.begin(),
[&](auto const& cat) { return category_indices.at(cat); });
mv2.category_names() = std::move(category_names);
mv2.block_categories() = std::move(written_categories);
mdb.set_category_names(std::move(category_names));
mdb.set_block_categories(std::move(written_categories));
}
mv2.features() = features.get();
mdb.set_block_size(segmenter_factory_.get_block_size());
mdb.set_total_fs_size(prog.original_size);
mdb.set_total_hardlink_size(prog.hardlink_size);
mdb.gather_global_entry_data(ge_data);
auto [schema, data] = metadata_freezer::freeze(mv2);
auto [schema, data] = metadata_freezer::freeze(mdb.build());
LOG_VERBOSE << "uncompressed metadata size: " << size_with_unit(data.size());

View File

@ -117,17 +117,17 @@ make_filesystem(::benchmark::State const* state,
options.with_devices = true;
options.with_specials = true;
options.keep_all_times = false;
options.pack_chunk_table = true;
options.pack_directories = state ? state->range(0) : true;
options.pack_shared_files_table = true;
options.pack_names = state ? state->range(2) : true;
options.pack_names_index = state ? state->range(3) : true;
options.pack_symlinks = state ? state->range(2) : true;
options.pack_symlinks_index = state ? state->range(3) : true;
options.force_pack_string_tables = true;
options.plain_names_table = state ? state->range(1) : false;
options.plain_symlinks_table = state ? state->range(1) : false;
options.metadata.keep_all_times = false;
options.metadata.pack_chunk_table = true;
options.metadata.pack_directories = state ? state->range(0) : true;
options.metadata.pack_shared_files_table = true;
options.metadata.pack_names = state ? state->range(2) : true;
options.metadata.pack_names_index = state ? state->range(3) : true;
options.metadata.pack_symlinks = state ? state->range(2) : true;
options.metadata.pack_symlinks_index = state ? state->range(3) : true;
options.metadata.force_pack_string_tables = true;
options.metadata.plain_names_table = state ? state->range(1) : false;
options.metadata.plain_symlinks_table = state ? state->range(1) : false;
test::test_logger lgr;

View File

@ -156,28 +156,28 @@ void basic_end_to_end_test(
options.with_devices = with_devices;
options.with_specials = with_specials;
options.inode.fragment_order.set_default(order_opts);
options.keep_all_times = keep_all_times;
options.pack_chunk_table = pack_chunk_table;
options.pack_directories = pack_directories;
options.pack_shared_files_table = pack_shared_files_table;
options.pack_names = pack_names;
options.pack_names_index = pack_names_index;
options.pack_symlinks = pack_symlinks;
options.pack_symlinks_index = pack_symlinks_index;
options.force_pack_string_tables = true;
options.plain_names_table = plain_names_table;
options.plain_symlinks_table = plain_symlinks_table;
options.metadata.keep_all_times = keep_all_times;
options.metadata.pack_chunk_table = pack_chunk_table;
options.metadata.pack_directories = pack_directories;
options.metadata.pack_shared_files_table = pack_shared_files_table;
options.metadata.pack_names = pack_names;
options.metadata.pack_names_index = pack_names_index;
options.metadata.pack_symlinks = pack_symlinks;
options.metadata.pack_symlinks_index = pack_symlinks_index;
options.metadata.force_pack_string_tables = true;
options.metadata.plain_names_table = plain_names_table;
options.metadata.plain_symlinks_table = plain_symlinks_table;
if (set_uid) {
options.uid = 0;
options.metadata.uid = 0;
}
if (set_gid) {
options.gid = 0;
options.metadata.gid = 0;
}
if (set_time) {
options.timestamp = 4711;
options.metadata.timestamp = 4711;
}
test::test_logger lgr;
@ -697,14 +697,14 @@ TEST_P(packing_test, regression_empty_fs) {
cfg.blockhash_window_size = 8;
cfg.block_size_bits = 10;
options.pack_chunk_table = pack_chunk_table;
options.pack_directories = pack_directories;
options.pack_shared_files_table = pack_shared_files_table;
options.pack_names = pack_names;
options.pack_names_index = pack_names_index;
options.pack_symlinks = pack_symlinks;
options.pack_symlinks_index = pack_symlinks_index;
options.force_pack_string_tables = true;
options.metadata.pack_chunk_table = pack_chunk_table;
options.metadata.pack_directories = pack_directories;
options.metadata.pack_shared_files_table = pack_shared_files_table;
options.metadata.pack_names = pack_names;
options.metadata.pack_names_index = pack_names_index;
options.metadata.pack_symlinks = pack_symlinks;
options.metadata.pack_symlinks_index = pack_symlinks_index;
options.metadata.force_pack_string_tables = true;
test::test_logger lgr;
@ -917,7 +917,7 @@ TEST_P(file_scanner, inode_ordering) {
opts.file_hash_algorithm = file_hash_algo;
opts.inode.fragment_order.set_default(order_opts);
opts.no_create_timestamp = true;
opts.metadata.no_create_timestamp = true;
auto input = std::make_shared<test::os_access_mock>();
#if defined(DWARFS_TEST_RUNNING_ON_ASAN) || defined(DWARFS_TEST_RUNNING_ON_TSAN)
@ -1963,7 +1963,7 @@ TEST(filesystem, inode_size_cache) {
}
writer::scanner_options options;
options.inode_size_cache_min_chunk_count = 32;
options.metadata.inode_size_cache_min_chunk_count = 32;
writer::segmenter::config cfg;
cfg.block_size_bits = 16;

View File

@ -652,13 +652,13 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
po::value<std::string>(&chmod_str),
"recursively apply permission changes")
("no-create-timestamp",
po::value<bool>(&options.no_create_timestamp)->zero_tokens(),
po::value<bool>(&options.metadata.no_create_timestamp)->zero_tokens(),
"don't add create timestamp to file system")
("set-time",
po::value<std::string>(&timestamp),
"set timestamp for whole file system (unixtime or 'now')")
("keep-all-times",
po::value<bool>(&options.keep_all_times)->zero_tokens(),
po::value<bool>(&options.metadata.keep_all_times)->zero_tokens(),
"save atime and ctime in addition to mtime")
("time-resolution",
po::value<std::string>(&time_resolution)->default_value("sec"),
@ -1004,22 +1004,23 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
}
if (vm.contains("set-owner")) {
options.uid = uid;
options.metadata.uid = uid;
}
if (vm.contains("set-group")) {
options.gid = gid;
options.metadata.gid = gid;
}
if (vm.contains("set-time")) {
if (timestamp == "now") {
options.timestamp = std::time(nullptr);
options.metadata.timestamp = std::time(nullptr);
} else if (auto val = try_to<uint64_t>(timestamp)) {
options.timestamp = val;
options.metadata.timestamp = val;
} else {
try {
auto tp = parse_time_point(timestamp);
options.timestamp = std::chrono::duration_cast<std::chrono::seconds>(
options.metadata.timestamp =
std::chrono::duration_cast<std::chrono::seconds>(
tp.time_since_epoch())
.count();
} catch (std::exception const& e) {
@ -1031,10 +1032,10 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
if (auto it = time_resolutions.find(time_resolution);
it != time_resolutions.end()) {
options.time_resolution_sec = it->second;
options.metadata.time_resolution_sec = it->second;
} else if (auto val = try_to<uint32_t>(time_resolution)) {
options.time_resolution_sec = *val;
if (options.time_resolution_sec == 0) {
options.metadata.time_resolution_sec = *val;
if (options.metadata.time_resolution_sec == 0) {
iol.err << "error: the argument to '--time-resolution' must be nonzero\n";
return 1;
}
@ -1046,45 +1047,45 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
if (!pack_metadata.empty() and pack_metadata != "none") {
if (pack_metadata == "auto") {
options.force_pack_string_tables = false;
options.pack_chunk_table = false;
options.pack_directories = false;
options.pack_shared_files_table = false;
options.pack_names = true;
options.pack_names_index = false;
options.pack_symlinks = true;
options.pack_symlinks_index = false;
options.metadata.force_pack_string_tables = false;
options.metadata.pack_chunk_table = false;
options.metadata.pack_directories = false;
options.metadata.pack_shared_files_table = false;
options.metadata.pack_names = true;
options.metadata.pack_names_index = false;
options.metadata.pack_symlinks = true;
options.metadata.pack_symlinks_index = false;
} else {
auto pack_opts =
split_to<std::vector<std::string_view>>(pack_metadata, ',');
for (auto const& opt : pack_opts) {
if (opt == "chunk_table") {
options.pack_chunk_table = true;
options.metadata.pack_chunk_table = true;
} else if (opt == "directories") {
options.pack_directories = true;
options.metadata.pack_directories = true;
} else if (opt == "shared_files") {
options.pack_shared_files_table = true;
options.metadata.pack_shared_files_table = true;
} else if (opt == "names") {
options.pack_names = true;
options.metadata.pack_names = true;
} else if (opt == "names_index") {
options.pack_names_index = true;
options.metadata.pack_names_index = true;
} else if (opt == "symlinks") {
options.pack_symlinks = true;
options.metadata.pack_symlinks = true;
} else if (opt == "symlinks_index") {
options.pack_symlinks_index = true;
options.metadata.pack_symlinks_index = true;
} else if (opt == "force") {
options.force_pack_string_tables = true;
options.metadata.force_pack_string_tables = true;
} else if (opt == "plain") {
options.plain_names_table = true;
options.plain_symlinks_table = true;
options.metadata.plain_names_table = true;
options.metadata.plain_symlinks_table = true;
} else if (opt == "all") {
options.pack_chunk_table = true;
options.pack_directories = true;
options.pack_shared_files_table = true;
options.pack_names = true;
options.pack_names_index = true;
options.pack_symlinks = true;
options.pack_symlinks_index = true;
options.metadata.pack_chunk_table = true;
options.metadata.pack_directories = true;
options.metadata.pack_shared_files_table = true;
options.metadata.pack_names = true;
options.metadata.pack_names_index = true;
options.metadata.pack_symlinks = true;
options.metadata.pack_symlinks_index = true;
} else {
iol.err << "error: the argument ('" << opt
<< "') to '--pack-metadata' is invalid\n";