mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-08 20:12:56 -04:00
More metadata tweaks, also document thrift metadata
This commit is contained in:
parent
f373144b73
commit
603e2c7ca3
@ -39,6 +39,8 @@
|
||||
namespace dwarfs {
|
||||
|
||||
struct global_entry_data {
|
||||
global_entry_data(bool no_time) : no_time_(no_time) {}
|
||||
|
||||
void add_uid(uint16_t uid) { add(uid, uids, next_uid_index); }
|
||||
|
||||
void add_gid(uint16_t gid) { add(gid, gids, next_gid_index); }
|
||||
@ -84,7 +86,7 @@ struct global_entry_data {
|
||||
}
|
||||
|
||||
uint64_t get_time_offset(uint64_t time) const {
|
||||
return time - timestamp_base;
|
||||
return no_time_ ? 0 : time - timestamp_base;
|
||||
}
|
||||
|
||||
std::vector<uint16_t> get_uids() const;
|
||||
@ -110,6 +112,7 @@ struct global_entry_data {
|
||||
uint16_t next_gid_index{0};
|
||||
uint16_t next_mode_index{0};
|
||||
uint64_t timestamp_base{std::numeric_limits<uint64_t>::max()};
|
||||
bool no_time_;
|
||||
};
|
||||
|
||||
class file;
|
||||
|
@ -37,5 +37,6 @@ std::ostream& operator<<(std::ostream& os, file_order_mode mode);
|
||||
|
||||
// Options that control a scanner run.
struct scanner_options {
|
||||
// Requested file ordering mode.
file_order_mode file_order;
|
||||
// Passed to global_entry_data by the scanner; when true, every timestamp
// offset is reported as 0 instead of (time - timestamp_base).
bool no_time;
|
||||
};
|
||||
} // namespace dwarfs
|
||||
|
@ -89,7 +89,7 @@ class dir_ : public dir {
|
||||
|
||||
void pack_entry(thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& data) const override {
|
||||
mv2.inode_index.at(inode_num()) = mv2.entries.size();
|
||||
mv2.entry_index.at(inode_num()) = mv2.entries.size();
|
||||
mv2.entries.emplace_back();
|
||||
entry::pack(mv2.entries.back(), data);
|
||||
}
|
||||
@ -121,7 +121,6 @@ class dir_ : public dir {
|
||||
void pack(thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& data) const override {
|
||||
thrift::metadata::directory dir;
|
||||
dir.self_inode = inode_num();
|
||||
dir.parent_inode =
|
||||
has_parent() ? std::dynamic_pointer_cast<dir_>(parent())->inode_num()
|
||||
: 0;
|
||||
@ -129,7 +128,7 @@ class dir_ : public dir {
|
||||
dir.entry_count = entries_.size();
|
||||
mv2.directories.push_back(dir);
|
||||
for (entry_ptr const& e : entries_) {
|
||||
mv2.inode_index.at(e->inode_num()) = mv2.entries.size();
|
||||
mv2.entry_index.at(e->inode_num()) = mv2.entries.size();
|
||||
mv2.entries.emplace_back();
|
||||
e->pack(mv2.entries.back(), data);
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ class metadata_v2_ : public metadata_v2::impl {
|
||||
: data_(std::move(meta))
|
||||
, meta_(::apache::thrift::frozen::mapFrozen<thrift::metadata::metadata>(
|
||||
data_))
|
||||
, root_(meta_.entries()[meta_.inode_index()[0]])
|
||||
, root_(meta_.entries()[meta_.entry_index()[0]])
|
||||
, inode_offset_(meta_.chunk_index_offset())
|
||||
, log_(lgr) {
|
||||
// TODO: defaults?
|
||||
@ -122,7 +122,7 @@ class metadata_v2_ : public metadata_v2::impl {
|
||||
if (S_ISREG(mode)) {
|
||||
return reg_filesize(entry.inode());
|
||||
} else if (S_ISLNK(mode)) {
|
||||
return meta_.links()[meta_.dir_link_index()[entry.inode()]].size();
|
||||
return meta_.links()[meta_.link_index()[entry.inode() - meta_.link_index_offset()]].size();
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
@ -206,13 +206,9 @@ void metadata_v2_<LoggerPolicy>::dump(
|
||||
// os << " " << filesize(entry, mode) << "\n";
|
||||
// icb(indent + " ", de->inode);
|
||||
} else if (S_ISDIR(mode)) {
|
||||
auto dir_index = meta_.dir_link_index()[inode];
|
||||
os << " => "
|
||||
<< "<dir:" << dir_index << ">"
|
||||
<< "\n";
|
||||
dump(os, indent + " ", meta_.directories()[dir_index], std::move(icb));
|
||||
dump(os, indent + " ", meta_.directories()[inode], std::move(icb));
|
||||
} else if (S_ISLNK(mode)) {
|
||||
os << " -> " << meta_.links()[meta_.dir_link_index()[inode]] << "\n";
|
||||
// link_index is indexed by (inode - link_index_offset); the subtraction
// must happen inside the subscript, not on the looked-up value.
os << " -> " << meta_.links()[meta_.link_index()[inode - meta_.link_index_offset()]] << "\n";
|
||||
} else {
|
||||
os << " (unknown type)\n";
|
||||
}
|
||||
|
@ -242,13 +242,13 @@ scanner_<LoggerPolicy>::compress_names_table(
|
||||
return offset;
|
||||
}
|
||||
|
||||
class set_inode_visitor : public entry_visitor {
|
||||
class dir_set_inode_visitor : public entry_visitor {
|
||||
public:
|
||||
void visit(file*) override {
|
||||
// nothing
|
||||
}
|
||||
dir_set_inode_visitor(uint32_t& inode_no) : inode_no_(inode_no) {};
|
||||
|
||||
void visit(link* p) override { p->set_inode(inode_no_++); }
|
||||
void visit(file*) override {}
|
||||
|
||||
void visit(link*) override {}
|
||||
|
||||
void visit(dir* p) override {
|
||||
p->sort();
|
||||
@ -258,7 +258,21 @@ class set_inode_visitor : public entry_visitor {
|
||||
uint32_t inode_no() const { return inode_no_; }
|
||||
|
||||
private:
|
||||
uint32_t inode_no_ = 0;
|
||||
uint32_t& inode_no_;
|
||||
};
|
||||
|
||||
class link_set_inode_visitor : public entry_visitor {
|
||||
public:
|
||||
link_set_inode_visitor(uint32_t& inode_no) : inode_no_(inode_no) {};
|
||||
|
||||
void visit(file*) override {}
|
||||
|
||||
void visit(link* p) override { p->set_inode(inode_no_++); }
|
||||
|
||||
void visit(dir*) override {}
|
||||
|
||||
private:
|
||||
uint32_t& inode_no_;
|
||||
};
|
||||
|
||||
class names_and_links_visitor : public entry_visitor {
|
||||
@ -299,10 +313,12 @@ class save_directories_visitor : public entry_visitor {
|
||||
public:
|
||||
save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& ge_data,
|
||||
std::vector<uint32_t>& dir_index,
|
||||
std::vector<uint32_t>& index)
|
||||
: mw_(mw)
|
||||
, mv2_(mv2)
|
||||
, ge_data_(ge_data)
|
||||
, dir_index_(dir_index)
|
||||
, cb_([&](const entry* e, size_t offset) {
|
||||
index.at(e->inode_num()) = folly::to<uint32_t>(offset);
|
||||
}) {}
|
||||
@ -316,7 +332,7 @@ class save_directories_visitor : public entry_visitor {
|
||||
}
|
||||
|
||||
void visit(dir* p) override {
|
||||
mv2_.dir_link_index.at(p->inode_num()) = mv2_.directories.size();
|
||||
dir_index_.at(p->inode_num()) = mv2_.directories.size();
|
||||
p->pack(mv2_, ge_data_);
|
||||
|
||||
p->set_offset(mw_.offset());
|
||||
@ -333,6 +349,7 @@ class save_directories_visitor : public entry_visitor {
|
||||
metadata_writer& mw_;
|
||||
thrift::metadata::metadata& mv2_;
|
||||
global_entry_data const& ge_data_;
|
||||
std::vector<uint32_t>& dir_index_;
|
||||
std::function<void(const entry* e, size_t offset)> cb_;
|
||||
};
|
||||
|
||||
@ -464,8 +481,12 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
log_.info() << "finding duplicate files...";
|
||||
|
||||
set_inode_visitor siv;
|
||||
root->accept(siv, true);
|
||||
uint32_t first_link_inode = 0;
|
||||
dir_set_inode_visitor dsiv(first_link_inode);
|
||||
root->accept(dsiv, true);
|
||||
uint32_t first_file_inode = first_link_inode;
|
||||
link_set_inode_visitor lsiv(first_file_inode);
|
||||
root->accept(lsiv, true);
|
||||
|
||||
auto im = inode_manager::create(cfg_.block_size_bits);
|
||||
|
||||
@ -525,14 +546,16 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
}
|
||||
|
||||
log_.info() << "numbering file inodes...";
|
||||
im->number_inodes(siv.inode_no());
|
||||
im->number_inodes(first_file_inode);
|
||||
|
||||
log_.info() << "building metadata...";
|
||||
std::vector<uint8_t> metadata_vec;
|
||||
metadata_writer mw(lgr_, metadata_vec);
|
||||
global_entry_data ge_data;
|
||||
global_entry_data ge_data(options_.no_time); // TODO: just pass options directly
|
||||
thrift::metadata::metadata mv2;
|
||||
mv2.dir_link_index.resize(siv.inode_no());
|
||||
std::vector<uint32_t> dir_index;
|
||||
dir_index.resize(first_link_inode);
|
||||
mv2.link_index.resize(first_file_inode - first_link_inode);
|
||||
|
||||
wg_.add_job([&] {
|
||||
mw.start_section(section_type::META_TABLEDATA);
|
||||
@ -554,7 +577,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
root->walk([&](entry* ep) {
|
||||
ep->update(ge_data);
|
||||
if (auto lp = dynamic_cast<link*>(ep)) {
|
||||
mv2.dir_link_index.at(ep->inode_num()) =
|
||||
mv2.link_index.at(ep->inode_num() - first_link_inode) =
|
||||
ge_data.get_link_index(lp->linkname());
|
||||
}
|
||||
if (ep->has_parent()) {
|
||||
@ -612,8 +635,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
// submitted for compression
|
||||
mw.align(im->chunk_size());
|
||||
im->for_each_inode([&](std::shared_ptr<inode> const& ino) {
|
||||
index.at(ino->num() - siv.inode_no()) = folly::to<uint32_t>(mw.offset());
|
||||
mv2.chunk_index.at(ino->num() - siv.inode_no()) = mv2.chunks.size();
|
||||
index.at(ino->num() - first_file_inode) = folly::to<uint32_t>(mw.offset());
|
||||
mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size();
|
||||
mw.write(ino->chunks());
|
||||
ino->append_chunks(mv2.chunks);
|
||||
});
|
||||
@ -635,10 +658,10 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
mw.finish_section();
|
||||
|
||||
log_.info() << "saving directories...";
|
||||
index.resize(siv.inode_no() + im->count());
|
||||
mv2.inode_index.resize(siv.inode_no() + im->count());
|
||||
index.resize(first_file_inode + im->count());
|
||||
mv2.entry_index.resize(first_file_inode + im->count());
|
||||
mw.start_section(section_type::META_DIRECTORIES);
|
||||
save_directories_visitor sdv(mw, mv2, ge_data, index);
|
||||
save_directories_visitor sdv(mw, mv2, ge_data, dir_index, index);
|
||||
root->accept(sdv);
|
||||
mw.finish_section();
|
||||
|
||||
@ -653,22 +676,30 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
mconf.block_size_bits = folly::to<uint8_t>(im->block_size_bits());
|
||||
mconf.de_type = entry_->de_type();
|
||||
mconf.unused = 0;
|
||||
mconf.inode_count = siv.inode_no() + im->count();
|
||||
mconf.inode_count = first_file_inode + im->count();
|
||||
mconf.orig_fs_size = prog.original_size;
|
||||
mconf.chunk_index_offset = siv.inode_no();
|
||||
mconf.chunk_index_offset = first_file_inode;
|
||||
mconf.inode_index_offset = 0;
|
||||
mw.write(mconf);
|
||||
mw.finish_section();
|
||||
|
||||
fsw.write_metadata(std::move(metadata_vec));
|
||||
|
||||
{
|
||||
std::vector<thrift::metadata::directory> tmp = std::move(mv2.directories);
|
||||
mv2.directories.reserve(tmp.size());
|
||||
for (auto i : dir_index) {
|
||||
mv2.directories.push_back(std::move(tmp[i]));
|
||||
}
|
||||
}
|
||||
|
||||
mv2.uids = ge_data.get_uids();
|
||||
mv2.gids = ge_data.get_gids();
|
||||
mv2.modes = ge_data.get_modes();
|
||||
mv2.names = ge_data.get_names();
|
||||
mv2.links = ge_data.get_links();
|
||||
mv2.timestamp_base = ge_data.timestamp_base;
|
||||
mv2.chunk_index_offset = siv.inode_no();
|
||||
mv2.chunk_index_offset = first_file_inode;
// Link inodes start at first_link_inode and link_index is filled at
// (inode - first_link_inode), so the reader needs this offset to undo it.
mv2.link_index_offset = first_link_inode;
|
||||
mv2.total_fs_size = prog.original_size;
|
||||
|
||||
fsw.write_metadata_v2(freeze_to_buffer(mv2));
|
||||
|
@ -423,6 +423,8 @@ int mkdwarfs(int argc, char** argv) {
|
||||
wg_writer.wait();
|
||||
ti << "filesystem rewritten";
|
||||
} else {
|
||||
options.no_time = no_time;
|
||||
|
||||
scanner s(lgr, wg_scanner, cfg,
|
||||
entry_factory::create(no_owner, no_owner || no_time,
|
||||
options.file_order ==
|
||||
|
@ -25,43 +25,119 @@ typedef i16 (cpp2.type = "uint16_t") UInt16
|
||||
typedef i32 (cpp2.type = "uint32_t") UInt32
|
||||
typedef i64 (cpp2.type = "uint64_t") UInt64
|
||||
|
||||
/**
|
||||
* One chunk of data. A single file can be composed of multiple
|
||||
* chunks. Chunks may be overlapping if there is identical data
|
||||
* in different files.
|
||||
*/
|
||||
struct chunk {
|
||||
1: required UInt32 block,
|
||||
2: required UInt32 offset,
|
||||
3: required UInt32 size,
|
||||
}
|
||||
|
||||
/**
|
||||
* One directory. This contains only a link to its parent inode
|
||||
* and a range of `entry` objects that can be looked up in
|
||||
* `metadata.entries`.
|
||||
*/
|
||||
struct directory {
|
||||
1: required UInt32 self_inode,
|
||||
2: required UInt32 parent_inode,
|
||||
3: required UInt32 first_entry,
|
||||
4: required UInt32 entry_count,
|
||||
1: required UInt32 parent_inode,
|
||||
2: required UInt32 first_entry,
|
||||
3: required UInt32 entry_count,
|
||||
}
|
||||
|
||||
/**
|
||||
* One entry. This can be files, directories or links. This is
|
||||
* by far the most common metadata object type, so it has been
|
||||
* optimized for size.
|
||||
*/
|
||||
struct entry {
|
||||
// index into metadata.names
|
||||
1: required UInt32 name_index,
|
||||
|
||||
// index into metadata.modes
|
||||
2: required UInt16 mode,
|
||||
|
||||
/**
|
||||
* Inode number. Can be used in different ways:
|
||||
*
|
||||
* - For directories, the inode can be used as an index into
|
||||
* metadata.directories.
|
||||
* - For links, (inode - metadata.link_index_offset) can be
|
||||
* used as an index into metadata.links.
|
||||
* - For files, (inode - metadata.chunk_index_offset) can be
|
||||
* used as an index into metadata.chunk_index.
|
||||
*/
|
||||
3: required UInt32 inode,
|
||||
|
||||
// index into metadata.uids
|
||||
4: required UInt16 owner,
|
||||
|
||||
// index into metadata.gids
|
||||
5: required UInt16 group,
|
||||
|
||||
// atime relative to metadata.timestamp_base
|
||||
6: required UInt64 atime,
|
||||
|
||||
// mtime relative to metadata.timestamp_base
|
||||
7: required UInt64 mtime,
|
||||
|
||||
// ctime relative to metadata.timestamp_base
|
||||
8: required UInt64 ctime,
|
||||
}
|
||||
|
||||
struct metadata {
|
||||
/**
|
||||
* Ranges of chunks that make up regular files. Identical
|
||||
* files share the same inode number. The range of chunks
|
||||
* for a regular file inode is:
|
||||
*
|
||||
* chunks[chunk_index[inode]] .. chunks[chunk_index[inode + 1] - 1]
|
||||
*/
|
||||
1: required list<chunk> chunks,
|
||||
2: required list<UInt32> chunk_index,
|
||||
3: required list<directory> directories,
|
||||
4: required list<entry> entries,
|
||||
5: required list<UInt32> inode_index,
|
||||
6: required list<UInt32> dir_link_index,
|
||||
|
||||
// all directories, indexed by inode number
|
||||
2: required list<directory> directories,
|
||||
|
||||
// all entries, can be looked up by inode through entry_index
|
||||
3: required list<entry> entries,
|
||||
|
||||
// chunk index, indexed by (inode - chunk_index_offset); this
|
||||
// list has one extra item at the back that points to the end
|
||||
// of `chunks`, so chunk lookups work the same for all inodes
|
||||
4: required list<UInt32> chunk_index,
|
||||
|
||||
// entry index, indexed by inode
|
||||
5: required list<UInt32> entry_index,
|
||||
|
||||
// link index, indexed by (inode - link_index_offset)
|
||||
6: required list<UInt32> link_index,
|
||||
|
||||
// user ids, for lookup by index in entry.owner
|
||||
7: required list<UInt16> uids,
|
||||
|
||||
// group ids, for lookup by index in entry.group
|
||||
8: required list<UInt16> gids,
|
||||
|
||||
// entry modes, for lookup by index in entry.mode
|
||||
9: required list<UInt16> modes,
|
||||
|
||||
// entry names, for lookup by index in entry.name_index
|
||||
10: required list<string> names,
|
||||
|
||||
// link targets, for lookup by index from link_index
|
||||
11: required list<string> links,
|
||||
|
||||
// timestamp base for all entry timestamps
|
||||
12: required UInt64 timestamp_base,
|
||||
|
||||
// inode offset for lookups into chunk_index
|
||||
13: required UInt32 chunk_index_offset;
|
||||
14: required UInt64 total_fs_size;
|
||||
|
||||
// inode offset for lookups into link_index
|
||||
14: required UInt32 link_index_offset;
|
||||
|
||||
// total file system size
|
||||
15: required UInt64 total_fs_size;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user