diff --git a/include/dwarfs/file_scanner.h b/include/dwarfs/file_scanner.h index c26d7863..4b8f4690 100644 --- a/include/dwarfs/file_scanner.h +++ b/include/dwarfs/file_scanner.h @@ -21,6 +21,7 @@ #pragma once +#include #include #include #include @@ -40,13 +41,18 @@ namespace detail { class file_scanner { public: + struct options { + std::optional hash_algo{}; + bool debug_inode_create{false}; + }; + file_scanner(logger& lgr, worker_group& wg, os_access const& os, - inode_manager& im, std::optional const& hash_algo, - progress& prog); + inode_manager& im, progress& prog, options const& opts); void scan(file* p) { impl_->scan(p); } void finalize(uint32_t& inode_num) { impl_->finalize(inode_num); } uint32_t num_unique() const { return impl_->num_unique(); } + void dump(std::ostream& os) const { impl_->dump(os); } class impl { public: @@ -55,6 +61,7 @@ class file_scanner { virtual void scan(file* p) = 0; virtual void finalize(uint32_t& inode_num) = 0; virtual uint32_t num_unique() const = 0; + virtual void dump(std::ostream& os) const = 0; }; private: diff --git a/src/dwarfs/file_scanner.cpp b/src/dwarfs/file_scanner.cpp index 6dc6ac41..9fb9f386 100644 --- a/src/dwarfs/file_scanner.cpp +++ b/src/dwarfs/file_scanner.cpp @@ -19,12 +19,14 @@ * along with dwarfs. If not, see . */ +#include #include #include #include #include #include +#include #include @@ -40,6 +42,16 @@ #include "dwarfs/progress.h" #include "dwarfs/worker_group.h" +template +class fmt::formatter> { + public: + constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } + template + constexpr auto format(std::pair const& foo, Context& ctx) const { + return fmt::format_to(ctx.out(), "({}, {})", foo.first, foo.second); + } +}; + namespace dwarfs::detail { namespace { @@ -51,14 +63,16 @@ template class file_scanner_ final : public file_scanner::impl { public: file_scanner_(logger& lgr, worker_group& wg, os_access const& os, - inode_manager& im, std::optional const& hash_algo, - progress& prog); + inode_manager& im, progress& prog, + file_scanner::options const& opts); void scan(file* p) override; void finalize(uint32_t& inode_num) override; uint32_t num_unique() const override { return num_unique_; } + void dump(std::ostream& os) const override; + private: class condition_barrier { public: @@ -77,7 +91,7 @@ class file_scanner_ final : public file_scanner::impl { void scan_dedupe(file* p); void hash_file(file* p); - void add_inode(file* p); + void add_inode(file* p, int lineno); template void finalize_hardlinks(Lookup&& lookup); @@ -91,15 +105,47 @@ class file_scanner_ final : public file_scanner::impl { finalize_inodes(std::vector>& ent, uint32_t& inode_num, uint32_t& obj_num); + template + std::string format_key(T const& key) const { + return fmt::format("{}", key); + } + + template + std::string format_key(T const* key) const { + return fmt::format("{}", reinterpret_cast(key)); + } + + std::string format_key(std::string_view key) const { + return fmt::format("{}", folly::hexlify(key)); + } + + void dump_value(std::ostream& os, std::integral auto val) const { + os << fmt::format("{}", val); + } + + void dump_value(std::ostream& os, + std::shared_ptr const&) const { + os << "null"; + } + + void dump_value(std::ostream& os, file const* p) const; + void dump_value(std::ostream& os, inode::files_vector const& vec) const; + + void dump_inodes(std::ostream& os) const; + void dump_inode_create_info(std::ostream& os) const; + + template + void dump_map(std::ostream& os, std::string_view name, T const& map) const; + LOG_PROXY_DECL(LoggerPolicy); worker_group& wg_; os_access const& os_; inode_manager& im_; - std::optional const hash_algo_; progress& prog_; + file_scanner::options const opts_; uint32_t num_unique_{0}; folly::F14FastMap hardlinks_; - std::mutex mx_; + std::mutex mutable mx_; // The pair stores the file size and optionally a hash of the first // 4 KiB of the file. If there's a collision, the worst that can // happen is that we unnecessary hash a file that is not a duplicate. @@ -112,6 +158,13 @@ class file_scanner_ final : public file_scanner::impl { first_file_hashed_; folly::F14FastMap by_raw_inode_; folly::F14FastMap by_hash_; + + struct inode_create_info { + inode const* i; + file const* f; + int line; + }; + std::vector debug_inode_create_; }; // The `unique_size_` table holds an entry for each file size we @@ -154,15 +207,16 @@ class file_scanner_ final : public file_scanner::impl { // it should happen rarely enough to not be a problem. template -file_scanner_::file_scanner_( - logger& lgr, worker_group& wg, os_access const& os, inode_manager& im, - std::optional const& hash_algo, progress& prog) +file_scanner_::file_scanner_(logger& lgr, worker_group& wg, + os_access const& os, + inode_manager& im, progress& prog, + file_scanner::options const& opts) : LOG_PROXY_INIT(lgr) , wg_(wg) , os_(os) , im_(im) - , hash_algo_{hash_algo} - , prog_(prog) {} + , prog_(prog) + , opts_{opts} {} template void file_scanner_::scan(file* p) { @@ -183,15 +237,15 @@ void file_scanner_::scan(file* p) { prog_.original_size += p->size(); - if (hash_algo_) { + if (opts_.hash_algo) { scan_dedupe(p); } else { prog_.current.store(p); - p->scan(nullptr, prog_, hash_algo_); // TODO + p->scan(nullptr, prog_, opts_.hash_algo); // TODO by_raw_inode_[p->raw_inode_num()].push_back(p); - add_inode(p); + add_inode(p, __LINE__); } } @@ -201,7 +255,7 @@ void file_scanner_::finalize(uint32_t& inode_num) { assert(first_file_hashed_.empty()); - if (hash_algo_) { + if (opts_.hash_algo) { finalize_hardlinks([this](file const* p) -> inode::files_vector& { if (auto it = by_hash_.find(p->hash()); it != by_hash_.end()) { return it->second; @@ -263,7 +317,7 @@ void file_scanner_::scan_dedupe(file* p) { { std::lock_guard lock(mx_); - add_inode(p); + add_inode(p, __LINE__); } } else { // This file size has been seen before, so this is potentially @@ -336,14 +390,14 @@ void file_scanner_::scan_dedupe(file* p) { } if (p->is_invalid()) [[unlikely]] { - add_inode(p); + add_inode(p, __LINE__); by_raw_inode_[p->raw_inode_num()].push_back(p); } else { auto& ref = by_hash_[p->hash()]; if (ref.empty()) { // This is *not* a duplicate. We must allocate a new inode. - add_inode(p); + add_inode(p, __LINE__); } else { auto inode = ref.front()->get_inode(); assert(inode); @@ -383,17 +437,21 @@ void file_scanner_::hash_file(file* p) { } prog_.current.store(p); - p->scan(mm.get(), prog_, hash_algo_); + p->scan(mm.get(), prog_, opts_.hash_algo); } template -void file_scanner_::add_inode(file* p) { +void file_scanner_::add_inode(file* p, int lineno) { assert(!p->get_inode()); auto inode = im_.create_inode(); p->set_inode(inode); + if (opts_.debug_inode_create) { + debug_inode_create_.push_back({inode.get(), p, lineno}); + } + im_.scan_background(wg_, os_, std::move(inode), p); } @@ -449,8 +507,7 @@ void file_scanner_::finalize_files( finalize_inodes(ent, inode_num, obj_num); } - tv << "finalized " << ent.size() << (UniqueOnly ? " " : " non-") - << "unique files"; + tv << "finalized " << ent.size() << (UniqueOnly ? " unique" : "") << " files"; } template @@ -458,10 +515,19 @@ template void file_scanner_::finalize_inodes( std::vector>& ent, uint32_t& inode_num, uint32_t& obj_num) { + int const obj_num_before = obj_num; + + auto tv = LOG_TIMED_VERBOSE; + for (auto& p : ent) { auto& files = p.second; if constexpr (Unique) { + DWARFS_CHECK(!files.empty(), + fmt::format("internal error in finalize_inodes: empty files " + "vector for key {}", + p.first)); + // this is true regardless of how the files are ordered if (files.size() > files.front()->refcount()) { continue; @@ -470,15 +536,18 @@ void file_scanner_::finalize_inodes( ++num_unique_; } else { if (files.empty()) { + // This is fine: the !Unique version is *always* called after the Unique + // version, which will have moved the unique file vectors. continue; } DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file"); - } - // needed for reproducibility - std::sort(files.begin(), files.end(), - [](file const* a, file const* b) { return a->less_revpath(*b); }); + // needed for reproducibility + std::sort(files.begin(), files.end(), [](file const* a, file const* b) { + return a->less_revpath(*b); + }); + } for (auto fp : files) { // need to check because hardlinks share the same number @@ -496,14 +565,141 @@ void file_scanner_::finalize_inodes( ++obj_num; } + + tv << "finalized " << (obj_num - obj_num_before) << (Unique ? " " : " non-") + << "unique inodes"; } + +template +void file_scanner_::dump_value(std::ostream& os, + file const* p) const { + std::shared_ptr ino = p->get_inode(); + auto ino_num = p->inode_num(); + + os << "{\n" + << " \"ptr\": \"" + << fmt::format("{}", reinterpret_cast(p)) << "\",\n" + << " \"path\": " << folly::toJson(p->path_as_string()) << ",\n" + << " \"size\": " << fmt::format("{}", p->size()) << ",\n" + << " \"refcnt\": " << fmt::format("{}", p->refcount()) << ",\n" + << " \"hash\": \"" << folly::hexlify(p->hash()) << "\",\n" + << " \"invalid\": " << (p->is_invalid() ? "true" : "false") << ",\n" + << " \"inode_num\": " + << (ino_num ? fmt::format("{}", *ino_num) : "null") << ",\n" + << " \"inode\": \"" + << fmt::format("{}", reinterpret_cast(ino.get())) << "\"\n" + << " }"; +} + +template +void file_scanner_::dump_value( + std::ostream& os, inode::files_vector const& vec) const { + os << "[\n"; + bool first = true; + for (auto p : vec) { + if (!first) { + os << ",\n"; + } + first = false; + os << " "; + dump_value(os, p); + } + os << "\n ]"; +} + +template +void file_scanner_::dump_inodes(std::ostream& os) const { + os << " \"inodes\": [\n"; + auto inodes = im_.sortable_span(); + inodes.all(); + bool first = true; + for (auto const& ino : inodes) { + if (!first) { + os << ",\n"; + } + first = false; + os << " {\n" + << " \"ptr\": \"" + << fmt::format("{}", reinterpret_cast(ino.get())) << "\",\n" + << " \"files\": "; + dump_value(os, ino->all()); + os << "\n }"; + } + os << "\n ]"; +} + +template +void file_scanner_::dump_inode_create_info( + std::ostream& os) const { + os << " \"inode_create_info\": [\n"; + bool first = true; + for (auto const& ici : debug_inode_create_) { + if (!first) { + os << ",\n"; + } + first = false; + os << " {\n" + << " \"inode\": \"" + << fmt::format("{}", reinterpret_cast(ici.i)) << "\",\n" + << " \"file\": "; + dump_value(os, ici.f); + os << ",\n" + << " \"line\": " << fmt::format("{}", ici.line) << "\n" + << " }"; + } + os << "\n ]"; +} + +template +template +void file_scanner_::dump_map(std::ostream& os, + std::string_view name, + T const& map) const { + os << " \"" << name << "\": {\n"; + + bool first = true; + + for (auto const& [k, v] : map) { + if (!first) { + os << ",\n"; + } + first = false; + os << " \"" << format_key(k) << "\": "; + dump_value(os, v); + } + + os << "\n }"; +} + +template +void file_scanner_::dump(std::ostream& os) const { + std::lock_guard lock(mx_); + + os << "{\n"; + dump_map(os, "hardlinks", hardlinks_); + os << ",\n"; + dump_map(os, "unique_size", unique_size_); + os << ",\n"; + dump_map(os, "file_start_hash", file_start_hash_); + os << ",\n"; + dump_map(os, "first_file_hashed", first_file_hashed_); + os << ",\n"; + dump_map(os, "by_raw_inode", by_raw_inode_); + os << ",\n"; + dump_map(os, "by_hash", by_hash_); + os << ",\n"; + dump_inode_create_info(os); + os << ",\n"; + dump_inodes(os); + os << "\n}\n"; +} + } // namespace file_scanner::file_scanner(logger& lgr, worker_group& wg, os_access const& os, - inode_manager& im, - std::optional const& hash_algo, - progress& prog) + inode_manager& im, progress& prog, + options const& opts) : impl_{make_unique_logging_object( - lgr, wg, os, im, hash_algo, prog)} {} + lgr, wg, os, im, prog, opts)} {} } // namespace dwarfs::detail diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 8a4e629a..65e03526 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,10 @@ using namespace std::chrono_literals; namespace { +constexpr std::string_view kEnvVarDumpFilesRaw{"DWARFS_DUMP_FILES_RAW"}; +constexpr std::string_view kEnvVarDumpFilesFinal{"DWARFS_DUMP_FILES_FINAL"}; +constexpr std::string_view kEnvVarDumpInodes{"DWARFS_DUMP_INODES"}; + class visitor_base : public entry_visitor { public: void visit(file*) override {} @@ -302,6 +307,10 @@ class scanner_ final : public scanner::impl { progress& prog, detail::file_scanner& fs, bool debug_filter = false); + void dump_state(std::string_view env_var, std::string_view what, + std::shared_ptr fa, + std::function dumper) const; + LOG_PROXY_DECL(LoggerPolicy); worker_group& wg_; scanner_options const& options_; @@ -432,6 +441,31 @@ scanner_::add_entry(std::filesystem::path const& name, return nullptr; } +template +void scanner_::dump_state( + std::string_view env_var, std::string_view what, + std::shared_ptr fa, + std::function dumper) const { + if (auto dumpfile = os_->getenv(env_var)) { + if (fa) { + LOG_VERBOSE << "dumping " << what << " to " << *dumpfile; + std::error_code ec; + auto ofs = fa->open_output(*dumpfile, ec); + if (ec) { + LOG_ERROR << "cannot open '" << *dumpfile << "': " << ec.message(); + } else { + dumper(ofs->os()); + ofs->close(ec); + if (ec) { + LOG_ERROR << "cannot close '" << *dumpfile << "': " << ec.message(); + } + } + } else { + LOG_ERROR << "cannot dump " << what << ": no file access"; + } + } +} + template std::shared_ptr scanner_::scan_tree(std::filesystem::path const& path, @@ -578,8 +612,11 @@ void scanner_::scan( prog.set_status_function(status_string); inode_manager im(LOG_GET_LOGGER, prog, options_.inode); - detail::file_scanner fs(LOG_GET_LOGGER, wg_, *os_, im, - options_.file_hash_algorithm, prog); + detail::file_scanner fs( + LOG_GET_LOGGER, wg_, *os_, im, prog, + {.hash_algo = options_.file_hash_algorithm, + .debug_inode_create = os_->getenv(kEnvVarDumpFilesRaw) || + os_->getenv(kEnvVarDumpFilesFinal)}); auto root = list ? scan_list(path, *list, prog, fs) : scan_tree(path, prog, fs); @@ -611,10 +648,16 @@ void scanner_::scan( LOG_INFO << "scanning CPU time: " << time_with_unit(wg_.get_cpu_time().value_or(0ns)); + dump_state(kEnvVarDumpFilesRaw, "raw files", fa, + [&fs](auto& os) { fs.dump(os); }); + LOG_INFO << "finalizing file inodes..."; uint32_t first_device_inode = first_file_inode; fs.finalize(first_device_inode); + dump_state(kEnvVarDumpFilesFinal, "final files", fa, + [&fs](auto& os) { fs.dump(os); }); + // this must be done after finalizing the inodes since this is when // the file vectors are populated if (im.has_invalid_inodes()) { @@ -675,24 +718,7 @@ void scanner_::scan( }); }); - if (auto dumpfile = os_->getenv("DWARFS_DUMP_INODES")) { - if (fa) { - LOG_VERBOSE << "dumping inodes to " << *dumpfile; - std::error_code ec; - auto ofs = fa->open_output(*dumpfile, ec); - if (ec) { - LOG_ERROR << "cannot open '" << *dumpfile << "': " << ec.message(); - } else { - im.dump(ofs->os()); - ofs->close(ec); - if (ec) { - LOG_ERROR << "cannot close '" << *dumpfile << "': " << ec.message(); - } - } - } else { - LOG_ERROR << "cannot dump inodes: no file access"; - } - } + dump_state(kEnvVarDumpInodes, "inodes", fa, [&im](auto& os) { im.dump(os); }); LOG_INFO << "building blocks...";