More metadata tweaks, also document thrift metadata

This commit is contained in:
Marcus Holland-Moritz 2020-11-26 22:17:29 +01:00
parent f373144b73
commit 603e2c7ca3
7 changed files with 151 additions and 43 deletions

View File

@ -39,6 +39,8 @@
namespace dwarfs {
struct global_entry_data {
global_entry_data(bool no_time) : no_time_(no_time) {}
void add_uid(uint16_t uid) { add(uid, uids, next_uid_index); }
void add_gid(uint16_t gid) { add(gid, gids, next_gid_index); }
@ -84,7 +86,7 @@ struct global_entry_data {
}
uint64_t get_time_offset(uint64_t time) const {
return time - timestamp_base;
return no_time_ ? 0 : time - timestamp_base;
}
std::vector<uint16_t> get_uids() const;
@ -110,6 +112,7 @@ struct global_entry_data {
uint16_t next_gid_index{0};
uint16_t next_mode_index{0};
uint64_t timestamp_base{std::numeric_limits<uint64_t>::max()};
bool no_time_;
};
class file;

View File

@ -37,5 +37,6 @@ std::ostream& operator<<(std::ostream& os, file_order_mode mode);
/**
 * Options that control how the scanner builds a file system image.
 */
struct scanner_options {
  // Ordering mode applied to files; see file_order_mode for the choices.
  file_order_mode file_order;

  // When true, time information is dropped: the scanner forwards this flag
  // to global_entry_data, whose get_time_offset() then yields 0.
  // Defaults to false so that an options object whose flag was never
  // assigned is not read uninitialized.
  bool no_time{false};
};
} // namespace dwarfs

View File

@ -89,7 +89,7 @@ class dir_ : public dir {
void pack_entry(thrift::metadata::metadata& mv2,
global_entry_data const& data) const override {
mv2.inode_index.at(inode_num()) = mv2.entries.size();
mv2.entry_index.at(inode_num()) = mv2.entries.size();
mv2.entries.emplace_back();
entry::pack(mv2.entries.back(), data);
}
@ -121,7 +121,6 @@ class dir_ : public dir {
void pack(thrift::metadata::metadata& mv2,
global_entry_data const& data) const override {
thrift::metadata::directory dir;
dir.self_inode = inode_num();
dir.parent_inode =
has_parent() ? std::dynamic_pointer_cast<dir_>(parent())->inode_num()
: 0;
@ -129,7 +128,7 @@ class dir_ : public dir {
dir.entry_count = entries_.size();
mv2.directories.push_back(dir);
for (entry_ptr const& e : entries_) {
mv2.inode_index.at(e->inode_num()) = mv2.entries.size();
mv2.entry_index.at(e->inode_num()) = mv2.entries.size();
mv2.entries.emplace_back();
e->pack(mv2.entries.back(), data);
}

View File

@ -51,7 +51,7 @@ class metadata_v2_ : public metadata_v2::impl {
: data_(std::move(meta))
, meta_(::apache::thrift::frozen::mapFrozen<thrift::metadata::metadata>(
data_))
, root_(meta_.entries()[meta_.inode_index()[0]])
, root_(meta_.entries()[meta_.entry_index()[0]])
, inode_offset_(meta_.chunk_index_offset())
, log_(lgr) {
// TODO: defaults?
@ -122,7 +122,7 @@ class metadata_v2_ : public metadata_v2::impl {
if (S_ISREG(mode)) {
return reg_filesize(entry.inode());
} else if (S_ISLNK(mode)) {
return meta_.links()[meta_.dir_link_index()[entry.inode()]].size();
return meta_.links()[meta_.link_index()[entry.inode() - meta_.link_index_offset()]].size();
} else {
return 0;
}
@ -206,13 +206,9 @@ void metadata_v2_<LoggerPolicy>::dump(
// os << " " << filesize(entry, mode) << "\n";
// icb(indent + " ", de->inode);
} else if (S_ISDIR(mode)) {
auto dir_index = meta_.dir_link_index()[inode];
os << " => "
<< "<dir:" << dir_index << ">"
<< "\n";
dump(os, indent + " ", meta_.directories()[dir_index], std::move(icb));
dump(os, indent + " ", meta_.directories()[inode], std::move(icb));
} else if (S_ISLNK(mode)) {
os << " -> " << meta_.links()[meta_.dir_link_index()[inode]] << "\n";
os << " -> " << meta_.links()[meta_.link_index()[inode] - meta_.link_index_offset()] << "\n";
} else {
os << " (unknown type)\n";
}

View File

@ -242,13 +242,13 @@ scanner_<LoggerPolicy>::compress_names_table(
return offset;
}
class set_inode_visitor : public entry_visitor {
class dir_set_inode_visitor : public entry_visitor {
public:
void visit(file*) override {
// nothing
}
dir_set_inode_visitor(uint32_t& inode_no) : inode_no_(inode_no) {};
void visit(link* p) override { p->set_inode(inode_no_++); }
void visit(file*) override {}
void visit(link*) override {}
void visit(dir* p) override {
p->sort();
@ -258,7 +258,21 @@ class set_inode_visitor : public entry_visitor {
uint32_t inode_no() const { return inode_no_; }
private:
uint32_t inode_no_ = 0;
uint32_t& inode_no_;
};
/**
 * Visitor that assigns consecutive inode numbers to link entries.
 *
 * File and directory entries are left untouched. The counter is held by
 * reference, so once a traversal has finished the caller's variable holds
 * the first number that was not handed out to a link.
 */
class link_set_inode_visitor : public entry_visitor {
 public:
  // `inode_no` is the first number to assign and also the running counter;
  // the referenced variable must outlive this visitor.
  explicit link_set_inode_visitor(uint32_t& inode_no)
      : inode_no_(inode_no) {}

  void visit(file*) override {}

  // Give the link the next free inode number, then advance the counter.
  void visit(link* p) override { p->set_inode(inode_no_++); }

  void visit(dir*) override {}

 private:
  uint32_t& inode_no_;
};
class names_and_links_visitor : public entry_visitor {
@ -299,10 +313,12 @@ class save_directories_visitor : public entry_visitor {
public:
save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2,
global_entry_data const& ge_data,
std::vector<uint32_t>& dir_index,
std::vector<uint32_t>& index)
: mw_(mw)
, mv2_(mv2)
, ge_data_(ge_data)
, dir_index_(dir_index)
, cb_([&](const entry* e, size_t offset) {
index.at(e->inode_num()) = folly::to<uint32_t>(offset);
}) {}
@ -316,7 +332,7 @@ class save_directories_visitor : public entry_visitor {
}
void visit(dir* p) override {
mv2_.dir_link_index.at(p->inode_num()) = mv2_.directories.size();
dir_index_.at(p->inode_num()) = mv2_.directories.size();
p->pack(mv2_, ge_data_);
p->set_offset(mw_.offset());
@ -333,6 +349,7 @@ class save_directories_visitor : public entry_visitor {
metadata_writer& mw_;
thrift::metadata::metadata& mv2_;
global_entry_data const& ge_data_;
std::vector<uint32_t>& dir_index_;
std::function<void(const entry* e, size_t offset)> cb_;
};
@ -464,8 +481,12 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
log_.info() << "finding duplicate files...";
set_inode_visitor siv;
root->accept(siv, true);
uint32_t first_link_inode = 0;
dir_set_inode_visitor dsiv(first_link_inode);
root->accept(dsiv, true);
uint32_t first_file_inode = first_link_inode;
link_set_inode_visitor lsiv(first_file_inode);
root->accept(lsiv, true);
auto im = inode_manager::create(cfg_.block_size_bits);
@ -525,14 +546,16 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
}
log_.info() << "numbering file inodes...";
im->number_inodes(siv.inode_no());
im->number_inodes(first_file_inode);
log_.info() << "building metadata...";
std::vector<uint8_t> metadata_vec;
metadata_writer mw(lgr_, metadata_vec);
global_entry_data ge_data;
global_entry_data ge_data(options_.no_time); // TODO: just pass options directly
thrift::metadata::metadata mv2;
mv2.dir_link_index.resize(siv.inode_no());
std::vector<uint32_t> dir_index;
dir_index.resize(first_link_inode);
mv2.link_index.resize(first_file_inode - first_link_inode);
wg_.add_job([&] {
mw.start_section(section_type::META_TABLEDATA);
@ -554,7 +577,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
root->walk([&](entry* ep) {
ep->update(ge_data);
if (auto lp = dynamic_cast<link*>(ep)) {
mv2.dir_link_index.at(ep->inode_num()) =
mv2.link_index.at(ep->inode_num() - first_link_inode) =
ge_data.get_link_index(lp->linkname());
}
if (ep->has_parent()) {
@ -612,8 +635,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
// submitted for compression
mw.align(im->chunk_size());
im->for_each_inode([&](std::shared_ptr<inode> const& ino) {
index.at(ino->num() - siv.inode_no()) = folly::to<uint32_t>(mw.offset());
mv2.chunk_index.at(ino->num() - siv.inode_no()) = mv2.chunks.size();
index.at(ino->num() - first_file_inode) = folly::to<uint32_t>(mw.offset());
mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size();
mw.write(ino->chunks());
ino->append_chunks(mv2.chunks);
});
@ -635,10 +658,10 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
mw.finish_section();
log_.info() << "saving directories...";
index.resize(siv.inode_no() + im->count());
mv2.inode_index.resize(siv.inode_no() + im->count());
index.resize(first_file_inode + im->count());
mv2.entry_index.resize(first_file_inode + im->count());
mw.start_section(section_type::META_DIRECTORIES);
save_directories_visitor sdv(mw, mv2, ge_data, index);
save_directories_visitor sdv(mw, mv2, ge_data, dir_index, index);
root->accept(sdv);
mw.finish_section();
@ -653,22 +676,30 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
mconf.block_size_bits = folly::to<uint8_t>(im->block_size_bits());
mconf.de_type = entry_->de_type();
mconf.unused = 0;
mconf.inode_count = siv.inode_no() + im->count();
mconf.inode_count = first_file_inode + im->count();
mconf.orig_fs_size = prog.original_size;
mconf.chunk_index_offset = siv.inode_no();
mconf.chunk_index_offset = first_file_inode;
mconf.inode_index_offset = 0;
mw.write(mconf);
mw.finish_section();
fsw.write_metadata(std::move(metadata_vec));
{
std::vector<thrift::metadata::directory> tmp = std::move(mv2.directories);
mv2.directories.reserve(tmp.size());
for (auto i : dir_index) {
mv2.directories.push_back(std::move(tmp[i]));
}
}
mv2.uids = ge_data.get_uids();
mv2.gids = ge_data.get_gids();
mv2.modes = ge_data.get_modes();
mv2.names = ge_data.get_names();
mv2.links = ge_data.get_links();
mv2.timestamp_base = ge_data.timestamp_base;
mv2.chunk_index_offset = siv.inode_no();
mv2.chunk_index_offset = first_file_inode;
mv2.total_fs_size = prog.original_size;
fsw.write_metadata_v2(freeze_to_buffer(mv2));

View File

@ -423,6 +423,8 @@ int mkdwarfs(int argc, char** argv) {
wg_writer.wait();
ti << "filesystem rewritten";
} else {
options.no_time = no_time;
scanner s(lgr, wg_scanner, cfg,
entry_factory::create(no_owner, no_owner || no_time,
options.file_order ==

View File

@ -25,43 +25,119 @@ typedef i16 (cpp2.type = "uint16_t") UInt16
typedef i32 (cpp2.type = "uint32_t") UInt32
typedef i64 (cpp2.type = "uint64_t") UInt64
/**
* One chunk of data. A single file can be composed of multiple
* chunks. Chunks may be overlapping if there is identical data
* in different files.
*
* All three fields are unsigned 32-bit values (UInt32 maps to
* uint32_t via the typedef above).
*/
struct chunk {
// NOTE(review): presumably identifies the block holding the chunk's data -- confirm
1: required UInt32 block,
// NOTE(review): presumably the chunk's start position within that block -- confirm
2: required UInt32 offset,
// NOTE(review): presumably the chunk's length; the unit is not visible here -- confirm
3: required UInt32 size,
}
/**
* One directory. This contains only a link to its parent inode
* and a range of `entry` objects that can be looked up in
* `metadata.entries`.
*/
struct directory {
1: required UInt32 self_inode,
2: required UInt32 parent_inode,
3: required UInt32 first_entry,
4: required UInt32 entry_count,
1: required UInt32 parent_inode,
2: required UInt32 first_entry,
3: required UInt32 entry_count,
}
/**
* One entry. This can be a file, a directory or a link. This is
* by far the most common metadata object type, so it has been
* optimized for size.
*/
struct entry {
// index into metadata.names
1: required UInt32 name_index,
// index into metadata.modes
2: required UInt16 mode,
/**
* Inode number. Can be used in different ways:
*
* - For directories, the inode can be used as an index into
* metadata.directories.
* - For links, (inode - metadata.link_index_offset) can be
* used as an index into metadata.link_index; the value found
* there is the index of the link target in metadata.links.
* - For files, (inode - metadata.chunk_index_offset) can be
* used as an index into metadata.chunk_index.
*/
3: required UInt32 inode,
// index into metadata.uids
4: required UInt16 owner,
// index into metadata.gids
5: required UInt16 group,
// atime relative to metadata.timestamp_base
6: required UInt64 atime,
// mtime relative to metadata.timestamp_base
7: required UInt64 mtime,
// ctime relative to metadata.timestamp_base
8: required UInt64 ctime,
}
struct metadata {
/**
* Ranges of chunks that make up regular files. Identical
* files share the same inode number. With
* `i = inode - chunk_index_offset`, the range of chunks for a
* regular file inode is:
*
* chunks[chunk_index[i]] .. chunks[chunk_index[i + 1] - 1]
*/
1: required list<chunk> chunks,
2: required list<UInt32> chunk_index,
3: required list<directory> directories,
4: required list<entry> entries,
5: required list<UInt32> inode_index,
6: required list<UInt32> dir_link_index,
// all directories, indexed by inode number
2: required list<directory> directories,
// all entries, can be looked up by inode through entry_index
3: required list<entry> entries,
// chunk index, indexed by (inode - chunk_index_offset); this
// list has one extra item at the back that points to the end
// of `chunks`, so chunk lookups work the same for all inodes
4: required list<UInt32> chunk_index,
// entry index, indexed by inode
5: required list<UInt32> entry_index,
// link index, indexed by (inode - link_index_offset)
6: required list<UInt32> link_index,
// user ids, for lookup by index in entry.owner
7: required list<UInt16> uids,
// group ids, for lookup by index in entry.group
8: required list<UInt16> gids,
// entry modes, for lookup by index in entry.mode
9: required list<UInt16> modes,
// entry names, for lookup by index in entry.name_index
10: required list<string> names,
// link targets, for lookup by index from link_index
11: required list<string> links,
// timestamp base for all entry timestamps
12: required UInt64 timestamp_base,
// inode offset for lookups into chunk_index
13: required UInt32 chunk_index_offset;
14: required UInt64 total_fs_size;
// inode offset for lookups into link_index
14: required UInt32 link_index_offset;
// total file system size
15: required UInt64 total_fs_size;
}