More metadata tweaks, also document thrift metadata

This commit is contained in:
Marcus Holland-Moritz 2020-11-26 22:17:29 +01:00
parent f373144b73
commit 603e2c7ca3
7 changed files with 151 additions and 43 deletions

View File

@ -39,6 +39,8 @@
namespace dwarfs { namespace dwarfs {
struct global_entry_data { struct global_entry_data {
global_entry_data(bool no_time) : no_time_(no_time) {}
void add_uid(uint16_t uid) { add(uid, uids, next_uid_index); } void add_uid(uint16_t uid) { add(uid, uids, next_uid_index); }
void add_gid(uint16_t gid) { add(gid, gids, next_gid_index); } void add_gid(uint16_t gid) { add(gid, gids, next_gid_index); }
@ -84,7 +86,7 @@ struct global_entry_data {
} }
// Converts an absolute timestamp into an offset relative to timestamp_base.
// In the new (right-hand) revision the result is always 0 when no_time_ is set.
uint64_t get_time_offset(uint64_t time) const { uint64_t get_time_offset(uint64_t time) const {
return time - timestamp_base; return no_time_ ? 0 : time - timestamp_base;
} }
std::vector<uint16_t> get_uids() const; std::vector<uint16_t> get_uids() const;
@ -110,6 +112,7 @@ struct global_entry_data {
uint16_t next_gid_index{0}; uint16_t next_gid_index{0};
uint16_t next_mode_index{0}; uint16_t next_mode_index{0};
uint64_t timestamp_base{std::numeric_limits<uint64_t>::max()}; uint64_t timestamp_base{std::numeric_limits<uint64_t>::max()};
bool no_time_;
}; };
class file; class file;

View File

@ -37,5 +37,6 @@ std::ostream& operator<<(std::ostream& os, file_order_mode mode);
// Settings handed to the scanner.
struct scanner_options { struct scanner_options {
file_order_mode file_order; file_order_mode file_order;
// New in this revision: the scanner passes this flag to global_entry_data,
// where it makes get_time_offset() return 0.
bool no_time;
}; };
} // namespace dwarfs } // namespace dwarfs

View File

@ -89,7 +89,7 @@ class dir_ : public dir {
// Appends the entry for this directory itself to mv2.entries and records the
// position of that entry under this directory's inode number in the per-inode
// index (named inode_index in the old revision, entry_index in the new one).
void pack_entry(thrift::metadata::metadata& mv2, void pack_entry(thrift::metadata::metadata& mv2,
global_entry_data const& data) const override { global_entry_data const& data) const override {
// .at() throws if the index has not been resized to cover this inode
mv2.inode_index.at(inode_num()) = mv2.entries.size(); mv2.entry_index.at(inode_num()) = mv2.entries.size();
mv2.entries.emplace_back(); mv2.entries.emplace_back();
entry::pack(mv2.entries.back(), data); entry::pack(mv2.entries.back(), data);
} }
@ -121,7 +121,6 @@ class dir_ : public dir {
void pack(thrift::metadata::metadata& mv2, void pack(thrift::metadata::metadata& mv2,
global_entry_data const& data) const override { global_entry_data const& data) const override {
thrift::metadata::directory dir; thrift::metadata::directory dir;
dir.self_inode = inode_num();
dir.parent_inode = dir.parent_inode =
has_parent() ? std::dynamic_pointer_cast<dir_>(parent())->inode_num() has_parent() ? std::dynamic_pointer_cast<dir_>(parent())->inode_num()
: 0; : 0;
@ -129,7 +128,7 @@ class dir_ : public dir {
dir.entry_count = entries_.size(); dir.entry_count = entries_.size();
mv2.directories.push_back(dir); mv2.directories.push_back(dir);
for (entry_ptr const& e : entries_) { for (entry_ptr const& e : entries_) {
mv2.inode_index.at(e->inode_num()) = mv2.entries.size(); mv2.entry_index.at(e->inode_num()) = mv2.entries.size();
mv2.entries.emplace_back(); mv2.entries.emplace_back();
e->pack(mv2.entries.back(), data); e->pack(mv2.entries.back(), data);
} }

View File

@ -51,7 +51,7 @@ class metadata_v2_ : public metadata_v2::impl {
: data_(std::move(meta)) : data_(std::move(meta))
, meta_(::apache::thrift::frozen::mapFrozen<thrift::metadata::metadata>( , meta_(::apache::thrift::frozen::mapFrozen<thrift::metadata::metadata>(
data_)) data_))
, root_(meta_.entries()[meta_.inode_index()[0]]) , root_(meta_.entries()[meta_.entry_index()[0]])
, inode_offset_(meta_.chunk_index_offset()) , inode_offset_(meta_.chunk_index_offset())
, log_(lgr) { , log_(lgr) {
// TODO: defaults? // TODO: defaults?
@ -122,7 +122,7 @@ class metadata_v2_ : public metadata_v2::impl {
if (S_ISREG(mode)) { if (S_ISREG(mode)) {
return reg_filesize(entry.inode()); return reg_filesize(entry.inode());
} else if (S_ISLNK(mode)) { } else if (S_ISLNK(mode)) {
return meta_.links()[meta_.dir_link_index()[entry.inode()]].size(); return meta_.links()[meta_.link_index()[entry.inode() - meta_.link_index_offset()]].size();
} else { } else {
return 0; return 0;
} }
@ -206,13 +206,9 @@ void metadata_v2_<LoggerPolicy>::dump(
// os << " " << filesize(entry, mode) << "\n"; // os << " " << filesize(entry, mode) << "\n";
// icb(indent + " ", de->inode); // icb(indent + " ", de->inode);
} else if (S_ISDIR(mode)) { } else if (S_ISDIR(mode)) {
auto dir_index = meta_.dir_link_index()[inode]; dump(os, indent + " ", meta_.directories()[inode], std::move(icb));
os << " => "
<< "<dir:" << dir_index << ">"
<< "\n";
dump(os, indent + " ", meta_.directories()[dir_index], std::move(icb));
} else if (S_ISLNK(mode)) { } else if (S_ISLNK(mode)) {
// link_index is indexed by (inode - link_index_offset): the offset has to be
// subtracted from the inode before the lookup, not from the value read out of
// link_index (which is itself an index into links).
os << " -> " << meta_.links()[meta_.dir_link_index()[inode]] << "\n"; os << " -> " << meta_.links()[meta_.link_index()[inode - meta_.link_index_offset()]] << "\n";
} else { } else {
os << " (unknown type)\n"; os << " (unknown type)\n";
} }

View File

@ -242,13 +242,13 @@ scanner_<LoggerPolicy>::compress_names_table(
return offset; return offset;
} }
class set_inode_visitor : public entry_visitor { class dir_set_inode_visitor : public entry_visitor {
public: public:
void visit(file*) override { dir_set_inode_visitor(uint32_t& inode_no) : inode_no_(inode_no) {};
// nothing
}
void visit(link* p) override { p->set_inode(inode_no_++); } void visit(file*) override {}
void visit(link*) override {}
void visit(dir* p) override { void visit(dir* p) override {
p->sort(); p->sort();
@ -258,7 +258,21 @@ class set_inode_visitor : public entry_visitor {
uint32_t inode_no() const { return inode_no_; } uint32_t inode_no() const { return inode_no_; }
private: private:
uint32_t inode_no_ = 0; uint32_t& inode_no_;
};
// Visitor that numbers links only: every visited link is assigned the current
// value of the counter, which is then incremented. Files and directories are
// left untouched.
class link_set_inode_visitor : public entry_visitor {
public:
// The counter is held by reference, so the caller's variable advances as
// links are numbered.
link_set_inode_visitor(uint32_t& inode_no) : inode_no_(inode_no) {};
void visit(file*) override {}
void visit(link* p) override { p->set_inode(inode_no_++); }
void visit(dir*) override {}
private:
// next inode number to hand out; refers to storage owned by the caller
uint32_t& inode_no_;
}; };
class names_and_links_visitor : public entry_visitor { class names_and_links_visitor : public entry_visitor {
@ -299,10 +313,12 @@ class save_directories_visitor : public entry_visitor {
public: public:
save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2, save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2,
global_entry_data const& ge_data, global_entry_data const& ge_data,
std::vector<uint32_t>& dir_index,
std::vector<uint32_t>& index) std::vector<uint32_t>& index)
: mw_(mw) : mw_(mw)
, mv2_(mv2) , mv2_(mv2)
, ge_data_(ge_data) , ge_data_(ge_data)
, dir_index_(dir_index)
, cb_([&](const entry* e, size_t offset) { , cb_([&](const entry* e, size_t offset) {
index.at(e->inode_num()) = folly::to<uint32_t>(offset); index.at(e->inode_num()) = folly::to<uint32_t>(offset);
}) {} }) {}
@ -316,7 +332,7 @@ class save_directories_visitor : public entry_visitor {
} }
void visit(dir* p) override { void visit(dir* p) override {
mv2_.dir_link_index.at(p->inode_num()) = mv2_.directories.size(); dir_index_.at(p->inode_num()) = mv2_.directories.size();
p->pack(mv2_, ge_data_); p->pack(mv2_, ge_data_);
p->set_offset(mw_.offset()); p->set_offset(mw_.offset());
@ -333,6 +349,7 @@ class save_directories_visitor : public entry_visitor {
metadata_writer& mw_; metadata_writer& mw_;
thrift::metadata::metadata& mv2_; thrift::metadata::metadata& mv2_;
global_entry_data const& ge_data_; global_entry_data const& ge_data_;
std::vector<uint32_t>& dir_index_;
std::function<void(const entry* e, size_t offset)> cb_; std::function<void(const entry* e, size_t offset)> cb_;
}; };
@ -464,8 +481,12 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
log_.info() << "finding duplicate files..."; log_.info() << "finding duplicate files...";
set_inode_visitor siv; uint32_t first_link_inode = 0;
root->accept(siv, true); dir_set_inode_visitor dsiv(first_link_inode);
root->accept(dsiv, true);
uint32_t first_file_inode = first_link_inode;
link_set_inode_visitor lsiv(first_file_inode);
root->accept(lsiv, true);
auto im = inode_manager::create(cfg_.block_size_bits); auto im = inode_manager::create(cfg_.block_size_bits);
@ -525,14 +546,16 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
} }
log_.info() << "numbering file inodes..."; log_.info() << "numbering file inodes...";
im->number_inodes(siv.inode_no()); im->number_inodes(first_file_inode);
log_.info() << "building metadata..."; log_.info() << "building metadata...";
std::vector<uint8_t> metadata_vec; std::vector<uint8_t> metadata_vec;
metadata_writer mw(lgr_, metadata_vec); metadata_writer mw(lgr_, metadata_vec);
global_entry_data ge_data; global_entry_data ge_data(options_.no_time); // TODO: just pass options directly
thrift::metadata::metadata mv2; thrift::metadata::metadata mv2;
mv2.dir_link_index.resize(siv.inode_no()); std::vector<uint32_t> dir_index;
dir_index.resize(first_link_inode);
mv2.link_index.resize(first_file_inode - first_link_inode);
wg_.add_job([&] { wg_.add_job([&] {
mw.start_section(section_type::META_TABLEDATA); mw.start_section(section_type::META_TABLEDATA);
@ -554,7 +577,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
root->walk([&](entry* ep) { root->walk([&](entry* ep) {
ep->update(ge_data); ep->update(ge_data);
if (auto lp = dynamic_cast<link*>(ep)) { if (auto lp = dynamic_cast<link*>(ep)) {
mv2.dir_link_index.at(ep->inode_num()) = mv2.link_index.at(ep->inode_num() - first_link_inode) =
ge_data.get_link_index(lp->linkname()); ge_data.get_link_index(lp->linkname());
} }
if (ep->has_parent()) { if (ep->has_parent()) {
@ -612,8 +635,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
// submitted for compression // submitted for compression
mw.align(im->chunk_size()); mw.align(im->chunk_size());
im->for_each_inode([&](std::shared_ptr<inode> const& ino) { im->for_each_inode([&](std::shared_ptr<inode> const& ino) {
index.at(ino->num() - siv.inode_no()) = folly::to<uint32_t>(mw.offset()); index.at(ino->num() - first_file_inode) = folly::to<uint32_t>(mw.offset());
mv2.chunk_index.at(ino->num() - siv.inode_no()) = mv2.chunks.size(); mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size();
mw.write(ino->chunks()); mw.write(ino->chunks());
ino->append_chunks(mv2.chunks); ino->append_chunks(mv2.chunks);
}); });
@ -635,10 +658,10 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
mw.finish_section(); mw.finish_section();
log_.info() << "saving directories..."; log_.info() << "saving directories...";
index.resize(siv.inode_no() + im->count()); index.resize(first_file_inode + im->count());
mv2.inode_index.resize(siv.inode_no() + im->count()); mv2.entry_index.resize(first_file_inode + im->count());
mw.start_section(section_type::META_DIRECTORIES); mw.start_section(section_type::META_DIRECTORIES);
save_directories_visitor sdv(mw, mv2, ge_data, index); save_directories_visitor sdv(mw, mv2, ge_data, dir_index, index);
root->accept(sdv); root->accept(sdv);
mw.finish_section(); mw.finish_section();
@ -653,22 +676,30 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
mconf.block_size_bits = folly::to<uint8_t>(im->block_size_bits()); mconf.block_size_bits = folly::to<uint8_t>(im->block_size_bits());
mconf.de_type = entry_->de_type(); mconf.de_type = entry_->de_type();
mconf.unused = 0; mconf.unused = 0;
mconf.inode_count = siv.inode_no() + im->count(); mconf.inode_count = first_file_inode + im->count();
mconf.orig_fs_size = prog.original_size; mconf.orig_fs_size = prog.original_size;
mconf.chunk_index_offset = siv.inode_no(); mconf.chunk_index_offset = first_file_inode;
mconf.inode_index_offset = 0; mconf.inode_index_offset = 0;
mw.write(mconf); mw.write(mconf);
mw.finish_section(); mw.finish_section();
fsw.write_metadata(std::move(metadata_vec)); fsw.write_metadata(std::move(metadata_vec));
{
std::vector<thrift::metadata::directory> tmp = std::move(mv2.directories);
mv2.directories.reserve(tmp.size());
for (auto i : dir_index) {
mv2.directories.push_back(std::move(tmp[i]));
}
}
// Fill in the lookup tables and scalar fields of the thrift metadata before
// freezing it.
mv2.uids = ge_data.get_uids(); mv2.uids = ge_data.get_uids();
mv2.gids = ge_data.get_gids(); mv2.gids = ge_data.get_gids();
mv2.modes = ge_data.get_modes(); mv2.modes = ge_data.get_modes();
mv2.names = ge_data.get_names(); mv2.names = ge_data.get_names();
mv2.links = ge_data.get_links(); mv2.links = ge_data.get_links();
mv2.timestamp_base = ge_data.timestamp_base; mv2.timestamp_base = ge_data.timestamp_base;
mv2.chunk_index_offset = siv.inode_no(); mv2.chunk_index_offset = first_file_inode;
// link_index was filled using (inode - first_link_inode), so readers need the
// same base; without this assignment the field keeps its default value and
// link lookups use the wrong slot.
mv2.link_index_offset = first_link_inode;
mv2.total_fs_size = prog.original_size; mv2.total_fs_size = prog.original_size;
fsw.write_metadata_v2(freeze_to_buffer(mv2)); fsw.write_metadata_v2(freeze_to_buffer(mv2));

View File

@ -423,6 +423,8 @@ int mkdwarfs(int argc, char** argv) {
wg_writer.wait(); wg_writer.wait();
ti << "filesystem rewritten"; ti << "filesystem rewritten";
} else { } else {
options.no_time = no_time;
scanner s(lgr, wg_scanner, cfg, scanner s(lgr, wg_scanner, cfg,
entry_factory::create(no_owner, no_owner || no_time, entry_factory::create(no_owner, no_owner || no_time,
options.file_order == options.file_order ==

View File

@ -25,43 +25,119 @@ typedef i16 (cpp2.type = "uint16_t") UInt16
typedef i32 (cpp2.type = "uint32_t") UInt32 typedef i32 (cpp2.type = "uint32_t") UInt32
typedef i64 (cpp2.type = "uint64_t") UInt64 typedef i64 (cpp2.type = "uint64_t") UInt64
/**
* One chunk of data. A single file can be composed of multiple
* chunks. Chunks may be overlapping if there is identical data
* in different files.
*/
struct chunk { struct chunk {
1: required UInt32 block, 1: required UInt32 block,
2: required UInt32 offset, 2: required UInt32 offset,
3: required UInt32 size, 3: required UInt32 size,
} }
/**
* One directory. This contains only a link to its parent inode
* and a range of `entry` objects that can be looked up in
* `metadata.entries`.
*/
struct directory { struct directory {
1: required UInt32 self_inode, 1: required UInt32 parent_inode,
2: required UInt32 parent_inode, 2: required UInt32 first_entry,
3: required UInt32 first_entry, 3: required UInt32 entry_count,
4: required UInt32 entry_count,
} }
/**
* One entry. This can be a file, a directory or a link. This is
* by far the most common metadata object type, so it has been
* optimized for size.
*/
struct entry { struct entry {
// index into metadata.names
1: required UInt32 name_index, 1: required UInt32 name_index,
// index into metadata.modes
2: required UInt16 mode, 2: required UInt16 mode,
/**
* Inode number. Can be used in different ways:
*
* - For directories, the inode can be used as an index into
* metadata.directories.
* - For links, (inode - metadata.link_index_offset) can be
* used as an index into metadata.links.
* - For files, (inode - metadata.chunk_index_offset) can be
* used as an index into metadata.chunk_index.
*/
3: required UInt32 inode, 3: required UInt32 inode,
// index into metadata.uids
4: required UInt16 owner, 4: required UInt16 owner,
// index into metadata.gids
5: required UInt16 group, 5: required UInt16 group,
// atime relative to metadata.timestamp_base
6: required UInt64 atime, 6: required UInt64 atime,
// mtime relative to metadata.timestamp_base
7: required UInt64 mtime, 7: required UInt64 mtime,
// ctime relative to metadata.timestamp_base
8: required UInt64 ctime, 8: required UInt64 ctime,
} }
struct metadata { struct metadata {
/**
* Ranges of chunks that make up regular files. Identical
* files share the same inode number. The range of chunks
* for a regular file inode is:
*
* chunks[chunk_index[i]] .. chunks[chunk_index[i + 1] - 1]
*
* where i = inode - chunk_index_offset.
*/
1: required list<chunk> chunks, 1: required list<chunk> chunks,
2: required list<UInt32> chunk_index,
3: required list<directory> directories, // all directories, indexed by inode number
4: required list<entry> entries, 2: required list<directory> directories,
5: required list<UInt32> inode_index,
6: required list<UInt32> dir_link_index, // all entries, can be looked up by inode through entry_index
3: required list<entry> entries,
// chunk index, indexed by (inode - chunk_index_offset); this
// list has one extra item at the back that points to the end
// of `chunks`, so chunk lookups work the same for all inodes
4: required list<UInt32> chunk_index,
// entry index, indexed by inode
5: required list<UInt32> entry_index,
// link index, indexed by (inode - link_index_offset)
6: required list<UInt32> link_index,
// user ids, for lookup by index in entry.owner
7: required list<UInt16> uids, 7: required list<UInt16> uids,
// group ids, for lookup by index in entry.group
8: required list<UInt16> gids, 8: required list<UInt16> gids,
// entry modes, for lookup by index in entry.mode
9: required list<UInt16> modes, 9: required list<UInt16> modes,
// entry names, for lookup by index in entry.name_index
10: required list<string> names, 10: required list<string> names,
// link targets, for lookup by index from link_index
11: required list<string> links, 11: required list<string> links,
// timestamp base for all entry timestamps
12: required UInt64 timestamp_base, 12: required UInt64 timestamp_base,
// inode offset for lookups into chunk_index
13: required UInt32 chunk_index_offset; 13: required UInt32 chunk_index_offset;
14: required UInt64 total_fs_size;
// inode offset for lookups into link_index
14: required UInt32 link_index_offset;
// total file system size
15: required UInt64 total_fs_size;
} }