mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
metadata_v2: more refactoring
This commit is contained in:
parent
84c413cfd7
commit
fdcccb3095
@ -57,7 +57,6 @@ class entry : public file_interface {
|
||||
entry(const std::string& name, std::shared_ptr<entry> parent,
|
||||
const struct ::stat& st);
|
||||
|
||||
void scan(os_access& os, progress& prog);
|
||||
bool has_parent() const;
|
||||
std::shared_ptr<entry> parent() const;
|
||||
void set_name(const std::string& name);
|
||||
@ -73,9 +72,10 @@ class entry : public file_interface {
|
||||
void update(global_entry_data& data) const;
|
||||
virtual void accept(entry_visitor& v, bool preorder = false) = 0;
|
||||
virtual uint32_t inode_num() const = 0;
|
||||
virtual void scan(os_access& os, progress& prog) = 0;
|
||||
|
||||
protected:
|
||||
virtual void scan(os_access& os, const std::string& p, progress& prog) = 0;
|
||||
void scan_stat(os_access& os, std::string const& p);
|
||||
|
||||
private:
|
||||
std::string name_;
|
||||
@ -97,14 +97,14 @@ class file : public entry {
|
||||
void accept(entry_visitor& v, bool preorder) override;
|
||||
uint32_t inode_num() const override;
|
||||
uint32_t similarity_hash() const { return similarity_hash_; }
|
||||
|
||||
protected:
|
||||
void scan(os_access& os, const std::string& p, progress& prog) override;
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
|
||||
private:
|
||||
using hash_type = std::array<char, 20>;
|
||||
|
||||
uint32_t similarity_hash_{0};
|
||||
const bool with_similarity_;
|
||||
std::array<char, 20> hash_{0};
|
||||
hash_type hash_{0};
|
||||
std::shared_ptr<inode> inode_;
|
||||
};
|
||||
|
||||
@ -124,10 +124,9 @@ class dir : public entry {
|
||||
void pack_entry(thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& data) const;
|
||||
uint32_t inode_num() const override { return inode_; }
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
|
||||
protected:
|
||||
void scan(os_access& os, const std::string& p, progress& prog) override;
|
||||
|
||||
private:
|
||||
using entry_ptr = std::shared_ptr<entry>;
|
||||
|
||||
std::vector<std::shared_ptr<entry>> entries_;
|
||||
@ -143,9 +142,7 @@ class link : public entry {
|
||||
void set_inode(uint32_t inode);
|
||||
void accept(entry_visitor& v, bool preorder) override;
|
||||
uint32_t inode_num() const override { return inode_; }
|
||||
|
||||
protected:
|
||||
void scan(os_access& os, const std::string& p, progress& prog) override;
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
|
||||
private:
|
||||
std::string link_;
|
||||
|
@ -54,7 +54,8 @@ class filesystem_v2 {
|
||||
static void rewrite(logger& lgr, progress& prog, std::shared_ptr<mmif> mm,
|
||||
filesystem_writer& writer);
|
||||
|
||||
static void identify(logger& lgr, std::shared_ptr<mmif> mm, std::ostream& os);
|
||||
static void identify(logger& lgr, std::shared_ptr<mmif> mm, std::ostream& os,
|
||||
int detail_level = 0);
|
||||
|
||||
void dump(std::ostream& os, int detail_level) const {
|
||||
impl_->dump(os, detail_level);
|
||||
|
@ -44,10 +44,8 @@ entry::entry(const std::string& name, std::shared_ptr<entry> parent,
|
||||
, parent_(std::move(parent))
|
||||
, stat_(st) {}
|
||||
|
||||
void entry::scan(os_access& os, progress& prog) {
|
||||
const std::string& p = path();
|
||||
void entry::scan_stat(os_access& os, std::string const& p) {
|
||||
os.lstat(p, &stat_);
|
||||
scan(os, p, prog);
|
||||
}
|
||||
|
||||
bool entry::has_parent() const {
|
||||
@ -129,12 +127,13 @@ uint32_t file::inode_num() const { return inode_->num(); }
|
||||
|
||||
// Hands this file to the visitor's visit() overload; the unnamed bool
// parameter is ignored.
void file::accept(entry_visitor& v, bool) { v.visit(this); }
|
||||
|
||||
void file::scan(os_access& os, const std::string& p, progress& prog) {
|
||||
assert(SHA_DIGEST_LENGTH == hash_.size());
|
||||
void file::scan(os_access& os, progress& prog) {
|
||||
static_assert(SHA_DIGEST_LENGTH == sizeof(hash_type));
|
||||
|
||||
size_t s = size();
|
||||
auto p = path();
|
||||
scan_stat(os, p);
|
||||
|
||||
if (s > 0) {
|
||||
if (size_t s = size(); s > 0) {
|
||||
prog.original_size += s;
|
||||
auto mm = os.map_file(p, s);
|
||||
::SHA1(mm->as<unsigned char>(), s,
|
||||
@ -191,7 +190,7 @@ void dir::sort() {
|
||||
|
||||
// Stores the given inode number in inode_.
void dir::set_inode(uint32_t inode) { inode_ = inode; }
|
||||
|
||||
void dir::scan(os_access&, const std::string&, progress&) {}
|
||||
// Delegates to scan_stat() with this directory's path(); the progress
// argument is unused.
void dir::scan(os_access& os, progress&) { scan_stat(os, path()); }
|
||||
|
||||
void dir::pack_entry(thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& data) const {
|
||||
@ -223,7 +222,9 @@ void link::set_inode(uint32_t inode) { inode_ = inode; }
|
||||
|
||||
// Hands this link to the visitor's visit() overload; the unnamed bool
// parameter is ignored.
void link::accept(entry_visitor& v, bool) { v.visit(this); }
|
||||
|
||||
void link::scan(os_access& os, const std::string& p, progress& prog) {
|
||||
// Runs scan_stat() on this link's path, stores the result of os.readlink()
// in link_, and adds the link's size() to prog.original_size.
void link::scan(os_access& os, progress& prog) {
  const auto link_path = path();

  scan_stat(os, link_path);

  link_ = os.readlink(link_path, size());
  prog.original_size += size();
}
|
||||
|
@ -386,7 +386,7 @@ void filesystem_v2::rewrite(logger& lgr, progress& prog,
|
||||
}
|
||||
|
||||
void filesystem_v2::identify(logger& lgr, std::shared_ptr<mmif> mm,
|
||||
std::ostream& os) {
|
||||
std::ostream& os, int detail_level) {
|
||||
// TODO:
|
||||
log_proxy<debug_logger_policy> log(lgr);
|
||||
filesystem_parser parser(mm);
|
||||
@ -417,7 +417,7 @@ void filesystem_v2::identify(logger& lgr, std::shared_ptr<mmif> mm,
|
||||
|
||||
auto meta = make_metadata(lgr, mm, sections, schema_raw, meta_raw);
|
||||
|
||||
meta.dump(os, 0, [](const std::string&, uint32_t) {});
|
||||
meta.dump(os, detail_level, [](const std::string&, uint32_t) {});
|
||||
}
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -82,14 +82,73 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
|
||||
, lgr_(lgr)
|
||||
, log_(lgr) {}
|
||||
|
||||
class dir_set_inode_visitor : public entry_visitor {
|
||||
class visitor_base : public entry_visitor {
|
||||
public:
|
||||
void visit(file*) override {}
|
||||
void visit(link*) override {}
|
||||
void visit(dir*) override {}
|
||||
};
|
||||
|
||||
class scan_files_visitor : public visitor_base {
|
||||
public:
|
||||
scan_files_visitor(worker_group& wg, os_access& os, progress& prog)
|
||||
: wg_(wg)
|
||||
, os_(os)
|
||||
, prog_(prog) {}
|
||||
|
||||
void visit(file* p) override {
|
||||
wg_.add_job([=] {
|
||||
prog_.current.store(p);
|
||||
p->scan(os_, prog_);
|
||||
prog_.files_scanned++;
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
worker_group& wg_;
|
||||
os_access& os_;
|
||||
progress& prog_;
|
||||
};
|
||||
|
||||
class file_deduplication_visitor : public visitor_base {
|
||||
public:
|
||||
void visit(file* p) override { hash_[p->hash()].push_back(p); }
|
||||
|
||||
void deduplicate_files(inode_manager& im, progress& prog) {
|
||||
for (auto& p : hash_) {
|
||||
auto& files = p.second;
|
||||
|
||||
if (files.size() > 1) {
|
||||
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
|
||||
return a->path() < b->path();
|
||||
});
|
||||
}
|
||||
|
||||
auto first = files.front();
|
||||
{
|
||||
auto inode = im.create_inode();
|
||||
first->set_inode(inode);
|
||||
inode->set_file(first);
|
||||
}
|
||||
|
||||
if (files.size() > 1) {
|
||||
for (auto i = begin(files) + 1; i != end(files); ++i) {
|
||||
(*i)->set_inode(first->get_inode());
|
||||
prog.duplicate_files++;
|
||||
prog.saved_by_deduplication += (*i)->size();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::unordered_map<std::string_view, std::vector<file*>, folly::Hash> hash_;
|
||||
};
|
||||
|
||||
class dir_set_inode_visitor : public visitor_base {
|
||||
public:
|
||||
dir_set_inode_visitor(uint32_t& inode_no)
|
||||
: inode_no_(inode_no){};
|
||||
|
||||
void visit(file*) override {}
|
||||
|
||||
void visit(link*) override {}
|
||||
: inode_no_(inode_no) {}
|
||||
|
||||
void visit(dir* p) override {
|
||||
p->sort();
|
||||
@ -102,17 +161,13 @@ class dir_set_inode_visitor : public entry_visitor {
|
||||
uint32_t& inode_no_;
|
||||
};
|
||||
|
||||
class link_set_inode_visitor : public entry_visitor {
|
||||
class link_set_inode_visitor : public visitor_base {
|
||||
public:
|
||||
link_set_inode_visitor(uint32_t& inode_no)
|
||||
: inode_no_(inode_no){};
|
||||
|
||||
void visit(file*) override {}
|
||||
: inode_no_(inode_no) {}
|
||||
|
||||
void visit(link* p) override { p->set_inode(inode_no_++); }
|
||||
|
||||
void visit(dir*) override {}
|
||||
|
||||
private:
|
||||
uint32_t& inode_no_;
|
||||
};
|
||||
@ -139,7 +194,7 @@ class names_and_links_visitor : public entry_visitor {
|
||||
global_entry_data& data_;
|
||||
};
|
||||
|
||||
class save_directories_visitor : public entry_visitor {
|
||||
class save_directories_visitor : public visitor_base {
|
||||
public:
|
||||
save_directories_visitor(thrift::metadata::metadata& mv2,
|
||||
global_entry_data const& ge_data,
|
||||
@ -148,14 +203,6 @@ class save_directories_visitor : public entry_visitor {
|
||||
, ge_data_(ge_data)
|
||||
, dir_index_(dir_index) {}
|
||||
|
||||
void visit(file*) override {
|
||||
// nothing
|
||||
}
|
||||
|
||||
void visit(link*) override {
|
||||
// nothing
|
||||
}
|
||||
|
||||
void visit(dir* p) override {
|
||||
dir_index_.at(p->inode_num()) = mv2_.directories.size();
|
||||
p->pack(mv2_, ge_data_);
|
||||
@ -268,15 +315,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
}
|
||||
|
||||
// now scan all files
|
||||
root->walk([&](entry* ep) {
|
||||
wg_.add_job([=, &prog] {
|
||||
if (ep->type() == entry::E_FILE) {
|
||||
prog.current.store(ep);
|
||||
ep->scan(*os_, prog);
|
||||
prog.files_scanned++;
|
||||
}
|
||||
});
|
||||
});
|
||||
scan_files_visitor sfv(wg_, *os_, prog);
|
||||
root->accept(sfv);
|
||||
|
||||
log_.info() << "waiting for background scanners...";
|
||||
wg_.wait();
|
||||
@ -284,46 +324,24 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
std::unordered_map<std::string_view, std::vector<file*>, folly::Hash>
|
||||
file_hash;
|
||||
|
||||
// TODO: turn into visitor?
|
||||
root->walk([&](entry* ep) {
|
||||
if (auto fp = dynamic_cast<file*>(ep)) {
|
||||
file_hash[fp->hash()].push_back(fp);
|
||||
}
|
||||
});
|
||||
|
||||
log_.info() << "finding duplicate files...";
|
||||
log_.info() << "assigning directory and link inodes...";
|
||||
|
||||
uint32_t first_link_inode = 0;
|
||||
dir_set_inode_visitor dsiv(first_link_inode);
|
||||
root->accept(dsiv, true);
|
||||
|
||||
uint32_t first_file_inode = first_link_inode;
|
||||
link_set_inode_visitor lsiv(first_file_inode);
|
||||
root->accept(lsiv, true);
|
||||
|
||||
log_.info() << "finding duplicate files...";
|
||||
|
||||
auto im = inode_manager::create();
|
||||
|
||||
for (auto& p : file_hash) {
|
||||
if (p.second.size() > 1) {
|
||||
std::sort(
|
||||
p.second.begin(), p.second.end(),
|
||||
[](file const* a, file const* b) { return a->path() < b->path(); });
|
||||
}
|
||||
file_deduplication_visitor fdv;
|
||||
root->accept(fdv);
|
||||
|
||||
auto first = p.second.front();
|
||||
{
|
||||
auto inode = im->create_inode();
|
||||
first->set_inode(inode);
|
||||
inode->set_file(first);
|
||||
}
|
||||
|
||||
if (p.second.size() > 1) {
|
||||
for (auto i = begin(p.second) + 1; i != end(p.second); ++i) {
|
||||
(*i)->set_inode(first->get_inode());
|
||||
prog.duplicate_files++;
|
||||
prog.saved_by_deduplication += (*i)->size();
|
||||
}
|
||||
}
|
||||
}
|
||||
fdv.deduplicate_files(*im, prog);
|
||||
|
||||
log_.info() << "saved " << size_with_unit(prog.saved_by_deduplication)
|
||||
<< " / " << size_with_unit(prog.original_size) << " in "
|
||||
@ -357,7 +375,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
}
|
||||
}
|
||||
|
||||
log_.info() << "numbering file inodes...";
|
||||
log_.info() << "assigning file inodes...";
|
||||
im->number_inodes(first_file_inode);
|
||||
|
||||
log_.info() << "building metadata...";
|
||||
|
@ -45,8 +45,8 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
} else {
|
||||
// TODO: add more usage options...
|
||||
// dwarfs::filesystem_v2::identify(lgr, mm, std::cout);
|
||||
fs.dump(std::cout, 1);
|
||||
dwarfs::filesystem_v2::identify(lgr, mm, std::cout, 1);
|
||||
// fs.dump(std::cout, 1);
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
|
Loading…
x
Reference in New Issue
Block a user