diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index f36cc57e..9de29345 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -57,7 +57,6 @@ class entry : public file_interface { entry(const std::string& name, std::shared_ptr parent, const struct ::stat& st); - void scan(os_access& os, progress& prog); bool has_parent() const; std::shared_ptr parent() const; void set_name(const std::string& name); @@ -73,9 +72,10 @@ class entry : public file_interface { void update(global_entry_data& data) const; virtual void accept(entry_visitor& v, bool preorder = false) = 0; virtual uint32_t inode_num() const = 0; + virtual void scan(os_access& os, progress& prog) = 0; protected: - virtual void scan(os_access& os, const std::string& p, progress& prog) = 0; + void scan_stat(os_access& os, std::string const& p); private: std::string name_; @@ -97,14 +97,14 @@ class file : public entry { void accept(entry_visitor& v, bool preorder) override; uint32_t inode_num() const override; uint32_t similarity_hash() const { return similarity_hash_; } - - protected: - void scan(os_access& os, const std::string& p, progress& prog) override; + void scan(os_access& os, progress& prog) override; private: + using hash_type = std::array; + uint32_t similarity_hash_{0}; const bool with_similarity_; - std::array hash_{0}; + hash_type hash_{0}; std::shared_ptr inode_; }; @@ -124,10 +124,9 @@ class dir : public entry { void pack_entry(thrift::metadata::metadata& mv2, global_entry_data const& data) const; uint32_t inode_num() const override { return inode_; } + void scan(os_access& os, progress& prog) override; - protected: - void scan(os_access& os, const std::string& p, progress& prog) override; - + private: using entry_ptr = std::shared_ptr; std::vector> entries_; @@ -143,9 +142,7 @@ class link : public entry { void set_inode(uint32_t inode); void accept(entry_visitor& v, bool preorder) override; uint32_t inode_num() const override { return inode_; } - - protected: - void scan(os_access& os, const std::string& p, progress& prog) override; + void scan(os_access& os, progress& prog) override; private: std::string link_; diff --git a/include/dwarfs/filesystem_v2.h b/include/dwarfs/filesystem_v2.h index d77cb25c..88c0138a 100644 --- a/include/dwarfs/filesystem_v2.h +++ b/include/dwarfs/filesystem_v2.h @@ -54,7 +54,8 @@ class filesystem_v2 { static void rewrite(logger& lgr, progress& prog, std::shared_ptr mm, filesystem_writer& writer); - static void identify(logger& lgr, std::shared_ptr mm, std::ostream& os); + static void identify(logger& lgr, std::shared_ptr mm, std::ostream& os, + int detail_level = 0); void dump(std::ostream& os, int detail_level) const { impl_->dump(os, detail_level); diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index d6bb92be..131ce516 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -44,10 +44,8 @@ entry::entry(const std::string& name, std::shared_ptr parent, , parent_(std::move(parent)) , stat_(st) {} -void entry::scan(os_access& os, progress& prog) { - const std::string& p = path(); +void entry::scan_stat(os_access& os, std::string const& p) { os.lstat(p, &stat_); - scan(os, p, prog); } bool entry::has_parent() const { @@ -129,12 +127,13 @@ uint32_t file::inode_num() const { return inode_->num(); } void file::accept(entry_visitor& v, bool) { v.visit(this); } -void file::scan(os_access& os, const std::string& p, progress& prog) { - assert(SHA_DIGEST_LENGTH == hash_.size()); +void file::scan(os_access& os, progress& prog) { + static_assert(SHA_DIGEST_LENGTH == sizeof(hash_type)); - size_t s = size(); + auto p = path(); + scan_stat(os, p); - if (s > 0) { + if (size_t s = size(); s > 0) { prog.original_size += s; auto mm = os.map_file(p, s); ::SHA1(mm->as(), s, @@ -191,7 +190,7 @@ void dir::sort() { void dir::set_inode(uint32_t inode) { inode_ = inode; } -void dir::scan(os_access&, const std::string&, progress&) {} +void dir::scan(os_access& os, progress&) { scan_stat(os, path()); } void dir::pack_entry(thrift::metadata::metadata& mv2, global_entry_data const& data) const { @@ -223,7 +222,9 @@ void link::set_inode(uint32_t inode) { inode_ = inode; } void link::accept(entry_visitor& v, bool) { v.visit(this); } -void link::scan(os_access& os, const std::string& p, progress& prog) { +void link::scan(os_access& os, progress& prog) { + auto p = path(); + scan_stat(os, p); link_ = os.readlink(p, size()); prog.original_size += size(); } diff --git a/src/dwarfs/filesystem_v2.cpp b/src/dwarfs/filesystem_v2.cpp index 7380fe9f..b424c645 100644 --- a/src/dwarfs/filesystem_v2.cpp +++ b/src/dwarfs/filesystem_v2.cpp @@ -386,7 +386,7 @@ void filesystem_v2::rewrite(logger& lgr, progress& prog, } void filesystem_v2::identify(logger& lgr, std::shared_ptr mm, - std::ostream& os) { + std::ostream& os, int detail_level) { // TODO: log_proxy log(lgr); filesystem_parser parser(mm); @@ -417,7 +417,7 @@ void filesystem_v2::identify(logger& lgr, std::shared_ptr mm, auto meta = make_metadata(lgr, mm, sections, schema_raw, meta_raw); - meta.dump(os, 0, [](const std::string&, uint32_t) {}); + meta.dump(os, detail_level, [](const std::string&, uint32_t) {}); } } // namespace dwarfs diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 85b3deb3..d9533255 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -82,14 +82,73 @@ scanner_::scanner_(logger& lgr, worker_group& wg, , lgr_(lgr) , log_(lgr) {} -class dir_set_inode_visitor : public entry_visitor { +class visitor_base : public entry_visitor { + public: + void visit(file*) override {} + void visit(link*) override {} + void visit(dir*) override {} +}; + +class scan_files_visitor : public visitor_base { + public: + scan_files_visitor(worker_group& wg, os_access& os, progress& prog) + : wg_(wg) + , os_(os) + , prog_(prog) {} + + void visit(file* p) override { + wg_.add_job([=] { + prog_.current.store(p); + p->scan(os_, prog_); + prog_.files_scanned++; + }); + } + + private: + worker_group& wg_; + os_access& os_; + progress& prog_; +}; + +class file_deduplication_visitor : public visitor_base { + public: + void visit(file* p) override { hash_[p->hash()].push_back(p); } + + void deduplicate_files(inode_manager& im, progress& prog) { + for (auto& p : hash_) { + auto& files = p.second; + + if (files.size() > 1) { + std::sort(files.begin(), files.end(), [](file const* a, file const* b) { + return a->path() < b->path(); + }); + } + + auto first = files.front(); + { + auto inode = im.create_inode(); + first->set_inode(inode); + inode->set_file(first); + } + + if (files.size() > 1) { + for (auto i = begin(files) + 1; i != end(files); ++i) { + (*i)->set_inode(first->get_inode()); + prog.duplicate_files++; + prog.saved_by_deduplication += (*i)->size(); + } + } + } + } + + private: + std::unordered_map, folly::Hash> hash_; +}; + +class dir_set_inode_visitor : public visitor_base { public: dir_set_inode_visitor(uint32_t& inode_no) - : inode_no_(inode_no){}; - - void visit(file*) override {} - - void visit(link*) override {} + : inode_no_(inode_no) {} void visit(dir* p) override { p->sort(); @@ -102,17 +161,13 @@ class dir_set_inode_visitor : public entry_visitor { uint32_t& inode_no_; }; -class link_set_inode_visitor : public entry_visitor { +class link_set_inode_visitor : public visitor_base { public: link_set_inode_visitor(uint32_t& inode_no) - : inode_no_(inode_no){}; - - void visit(file*) override {} + : inode_no_(inode_no) {} void visit(link* p) override { p->set_inode(inode_no_++); } - void visit(dir*) override {} - private: uint32_t& inode_no_; }; @@ -139,7 +194,7 @@ class names_and_links_visitor : public entry_visitor { global_entry_data& data_; }; -class save_directories_visitor : public entry_visitor { +class save_directories_visitor : public visitor_base { public: save_directories_visitor(thrift::metadata::metadata& mv2, global_entry_data const& ge_data, @@ -148,14 +203,6 @@ class save_directories_visitor : public entry_visitor { , ge_data_(ge_data) , dir_index_(dir_index) {} - void visit(file*) override { - // nothing - } - - void visit(link*) override { - // nothing - } - void visit(dir* p) override { dir_index_.at(p->inode_num()) = mv2_.directories.size(); p->pack(mv2_, ge_data_); @@ -268,15 +315,8 @@ void scanner_::scan(filesystem_writer& fsw, } // now scan all files - root->walk([&](entry* ep) { - wg_.add_job([=, &prog] { - if (ep->type() == entry::E_FILE) { - prog.current.store(ep); - ep->scan(*os_, prog); - prog.files_scanned++; - } - }); - }); + scan_files_visitor sfv(wg_, *os_, prog); + root->accept(sfv); log_.info() << "waiting for background scanners..."; wg_.wait(); @@ -284,46 +324,24 @@ void scanner_::scan(filesystem_writer& fsw, std::unordered_map, folly::Hash> file_hash; - // TODO: turn into visitor? - root->walk([&](entry* ep) { - if (auto fp = dynamic_cast(ep)) { - file_hash[fp->hash()].push_back(fp); - } - }); - - log_.info() << "finding duplicate files..."; + log_.info() << "assigning directory and link inodes..."; uint32_t first_link_inode = 0; dir_set_inode_visitor dsiv(first_link_inode); root->accept(dsiv, true); + uint32_t first_file_inode = first_link_inode; link_set_inode_visitor lsiv(first_file_inode); root->accept(lsiv, true); + log_.info() << "finding duplicate files..."; + auto im = inode_manager::create(); - for (auto& p : file_hash) { - if (p.second.size() > 1) { - std::sort( - p.second.begin(), p.second.end(), - [](file const* a, file const* b) { return a->path() < b->path(); }); - } + file_deduplication_visitor fdv; + root->accept(fdv); - auto first = p.second.front(); - { - auto inode = im->create_inode(); - first->set_inode(inode); - inode->set_file(first); - } - - if (p.second.size() > 1) { - for (auto i = begin(p.second) + 1; i != end(p.second); ++i) { - (*i)->set_inode(first->get_inode()); - prog.duplicate_files++; - prog.saved_by_deduplication += (*i)->size(); - } - } - } + fdv.deduplicate_files(*im, prog); log_.info() << "saved " << size_with_unit(prog.saved_by_deduplication) << " / " << size_with_unit(prog.original_size) << " in " @@ -357,7 +375,7 @@ void scanner_::scan(filesystem_writer& fsw, } } - log_.info() << "numbering file inodes..."; + log_.info() << "assigning file inodes..."; im->number_inodes(first_file_inode); log_.info() << "building metadata..."; diff --git a/src/dwarfsck.cpp b/src/dwarfsck.cpp index 4d3b4db0..bd6008c1 100644 --- a/src/dwarfsck.cpp +++ b/src/dwarfsck.cpp @@ -45,8 +45,8 @@ int main(int argc, char** argv) { } } else { // TODO: add more usage options... - // dwarfs::filesystem_v2::identify(lgr, mm, std::cout); - fs.dump(std::cout, 1); + dwarfs::filesystem_v2::identify(lgr, mm, std::cout, 1); + // fs.dump(std::cout, 1); } } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl;