diff --git a/include/dwarfs/writer/fragment_order_options.h b/include/dwarfs/writer/fragment_order_options.h index e466a242..79d5aba7 100644 --- a/include/dwarfs/writer/fragment_order_options.h +++ b/include/dwarfs/writer/fragment_order_options.h @@ -23,14 +23,22 @@ #pragma once +#include #include +#include +#include namespace dwarfs::writer { -// TODO: rename? -> inode_order_mode / fragment_order_mode -enum class fragment_order_mode { NONE, PATH, REVPATH, SIMILARITY, NILSIMSA }; +enum class fragment_order_mode { + NONE, + PATH, + REVPATH, + SIMILARITY, + NILSIMSA, + EXPLICIT +}; -// TODO: rename? -> inode_order_options / fragment_order_options struct fragment_order_options { static constexpr int const kDefaultNilsimsaMaxChildren{16384}; static constexpr int const kDefaultNilsimsaMaxClusterSize{16384}; @@ -38,6 +46,8 @@ struct fragment_order_options { fragment_order_mode mode{fragment_order_mode::NONE}; int nilsimsa_max_children{kDefaultNilsimsaMaxChildren}; int nilsimsa_max_cluster_size{kDefaultNilsimsaMaxClusterSize}; + std::string explicit_order_file{}; + std::unordered_map explicit_order{}; }; std::ostream& operator<<(std::ostream& os, fragment_order_mode mode); diff --git a/include/dwarfs/writer/internal/inode_manager.h b/include/dwarfs/writer/internal/inode_manager.h index 4059068a..ef4ebdeb 100644 --- a/include/dwarfs/writer/internal/inode_manager.h +++ b/include/dwarfs/writer/internal/inode_manager.h @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include @@ -77,7 +78,9 @@ class inode_manager { size_t total_size{0}; }; - inode_manager(logger& lgr, progress& prog, inode_options const& opts); + inode_manager(logger& lgr, progress& prog, + std::filesystem::path const& root_path, + inode_options const& opts); std::shared_ptr create_inode() { return impl_->create_inode(); } diff --git a/include/dwarfs/writer/internal/inode_ordering.h b/include/dwarfs/writer/internal/inode_ordering.h index d1e4ec6e..270d3446 100644 --- a/include/dwarfs/writer/internal/inode_ordering.h +++ b/include/dwarfs/writer/internal/inode_ordering.h @@ -23,7 +23,9 @@ #pragma once +#include #include +#include #include @@ -40,6 +42,7 @@ class worker_group; namespace writer { struct inode_options; +struct fragment_order_options; namespace internal { @@ -71,6 +74,12 @@ class inode_ordering { impl_->by_nilsimsa(wg, opts, sp, cat); } + void by_explicit_order(sortable_inode_span& sp, + std::filesystem::path const& root_path, + fragment_order_options const& opts) const { + impl_->by_explicit_order(sp, root_path, opts); + } + class impl { public: virtual ~impl() = default; @@ -84,6 +93,10 @@ class inode_ordering { by_nilsimsa(dwarfs::internal::worker_group& wg, similarity_ordering_options const& opts, sortable_inode_span& sp, fragment_category cat) const = 0; + virtual void + by_explicit_order(sortable_inode_span& sp, + std::filesystem::path const& root_path, + fragment_order_options const& opts) const = 0; }; private: diff --git a/src/writer/fragment_order_options.cpp b/src/writer/fragment_order_options.cpp index 4d5a1511..266c119e 100644 --- a/src/writer/fragment_order_options.cpp +++ b/src/writer/fragment_order_options.cpp @@ -50,6 +50,9 @@ std::ostream& operator<<(std::ostream& os, fragment_order_mode mode) { case fragment_order_mode::NILSIMSA: modestr = "nilsimsa"; break; + case fragment_order_mode::EXPLICIT: + modestr = "explicit"; + break; } return os << modestr; diff --git a/src/writer/fragment_order_parser.cpp b/src/writer/fragment_order_parser.cpp index b820044c..c14b3297 100644 --- a/src/writer/fragment_order_parser.cpp +++ b/src/writer/fragment_order_parser.cpp @@ -22,6 +22,7 @@ */ #include +#include #include #include @@ -47,6 +48,7 @@ constexpr std::array order_choices{ std::pair{"revpath"sv, fragment_order_mode::REVPATH}, std::pair{"similarity"sv, fragment_order_mode::SIMILARITY}, std::pair{"nilsimsa"sv, fragment_order_mode::NILSIMSA}, + std::pair{"explicit"sv, fragment_order_mode::EXPLICIT}, }; } // namespace @@ -95,6 +97,21 @@ fragment_order_parser::parse(std::string_view arg) const { } break; + case fragment_order_mode::EXPLICIT: { + auto file = om.get("file"); + std::ifstream ifs{file}; + if (!ifs) { + throw std::runtime_error( + fmt::format("failed to open explicit order file '{}'", file)); + } + std::string line; + while (std::getline(ifs, line)) { + auto const path = std::filesystem::path{line}.relative_path(); + rv.explicit_order[path] = rv.explicit_order.size(); + } + rv.explicit_order_file = std::move(file); + } break; + default: throw std::runtime_error( fmt::format("inode order mode '{}' does not support options", algo)); @@ -125,6 +142,9 @@ fragment_order_parser::to_string(fragment_order_options const& opts) const { return fmt::format("nilsimsa:max_children={}:max_cluster_size={}", opts.nilsimsa_max_children, opts.nilsimsa_max_cluster_size); + + case fragment_order_mode::EXPLICIT: + return fmt::format("explicit:file={}", opts.explicit_order_file); } return ""; } diff --git a/src/writer/internal/inode_manager.cpp b/src/writer/internal/inode_manager.cpp index 11acbf60..6e8c9ae8 100644 --- a/src/writer/internal/inode_manager.cpp +++ b/src/writer/internal/inode_manager.cpp @@ -68,6 +68,7 @@ namespace dwarfs::writer::internal { using namespace dwarfs::internal; +namespace fs = std::filesystem; namespace { @@ -430,6 +431,7 @@ class inode_ : public inode { case fragment_order_mode::NONE: case fragment_order_mode::PATH: case fragment_order_mode::REVPATH: + case fragment_order_mode::EXPLICIT: break; case fragment_order_mode::SIMILARITY: sc.try_emplace(cat); @@ -493,6 +495,7 @@ class inode_ : public inode { case fragment_order_mode::NONE: case fragment_order_mode::PATH: case fragment_order_mode::REVPATH: + case fragment_order_mode::EXPLICIT: break; case fragment_order_mode::SIMILARITY: { @@ -545,9 +548,11 @@ class inode_ : public inode { template class inode_manager_ final : public inode_manager::impl { public: - inode_manager_(logger& lgr, progress& prog, inode_options const& opts) + inode_manager_(logger& lgr, progress& prog, fs::path const& root_path, + inode_options const& opts) : LOG_PROXY_INIT(lgr) , prog_(prog) + , root_path_{root_path} , opts_{opts} , inodes_need_scanning_{inodes_need_scanning(opts_)} {} @@ -657,6 +662,7 @@ class inode_manager_ final : public inode_manager::impl { LOG_PROXY_DECL(LoggerPolicy); std::vector> inodes_; progress& prog_; + fs::path const root_path_; inode_options opts_; bool const inodes_need_scanning_; std::atomic mutable num_invalid_inodes_{0}; @@ -816,14 +822,25 @@ auto inode_manager_::ordered_span(fragment_category cat, tv << prefix << span.size() << " inodes ordered"; break; } + + case fragment_order_mode::EXPLICIT: { + LOG_VERBOSE << prefix << "ordering " << span.size() + << " inodes by explicit order..."; + auto tv = LOG_CPU_TIMED_VERBOSE; + order.by_explicit_order(span, root_path_, opts); + tv << prefix << span.size() << " inodes ordered"; + break; + } } return span; } inode_manager::inode_manager(logger& lgr, progress& prog, + fs::path const& root_path, inode_options const& opts) : impl_(make_unique_logging_object(lgr, prog, opts)) {} + logger_policies>(lgr, prog, root_path, + opts)) {} } // namespace dwarfs::writer::internal diff --git a/src/writer/internal/inode_ordering.cpp b/src/writer/internal/inode_ordering.cpp index b1fd1ec5..8493eb1e 100644 --- a/src/writer/internal/inode_ordering.cpp +++ b/src/writer/internal/inode_ordering.cpp @@ -36,6 +36,7 @@ namespace dwarfs::writer::internal { using namespace dwarfs::internal; +namespace fs = std::filesystem; namespace { @@ -63,6 +64,8 @@ class inode_ordering_ final : public inode_ordering::impl { void by_nilsimsa(worker_group& wg, similarity_ordering_options const& opts, sortable_inode_span& sp, fragment_category cat) const override; + void by_explicit_order(sortable_inode_span& sp, fs::path const& root_path, + fragment_order_options const& opts) const override; private: void @@ -207,6 +210,46 @@ void inode_ordering_::by_nilsimsa_impl( future.get().swap(index); } +template +void inode_ordering_::by_explicit_order( + sortable_inode_span& sp, fs::path const& root_path, + fragment_order_options const& opts) const { + auto raw = sp.raw(); + auto& index = sp.index(); + auto const& order = opts.explicit_order; + + if (order.empty()) { + LOG_WARN << "empty explicit order file set"; + } + + std::vector paths; + std::vector> path_order; + paths.resize(raw.size()); + path_order.resize(raw.size()); + + for (auto i : index) { + paths[i] = fs::relative(raw[i]->any()->fs_path(), root_path); + + if (auto it = order.find(paths[i]); it != order.end()) { + path_order[i] = it->second; + } else { + LOG_DEBUG << "explicit order: " << paths[i] + << " not found in explicit order file"; + } + } + + std::sort(index.begin(), index.end(), [&](auto a, auto b) { + auto const& ai = path_order[a]; + auto const& bi = path_order[b]; + return ai.has_value() && bi.has_value() ? *ai < *bi + : raw[a]->num() < raw[b]->num(); + }); + + for (auto i : index) { + LOG_DEBUG << "explicit order: " << paths[i]; + } +} + inode_ordering::inode_ordering(logger& lgr, progress& prog, inode_options const& opts) : impl_(make_unique_logging_object::scan( prog.set_status_function(status_string); - inode_manager im(LOG_GET_LOGGER, prog, options_.inode); + inode_manager im(LOG_GET_LOGGER, prog, path, options_.inode); file_scanner fs(LOG_GET_LOGGER, wg_, os_, im, prog, {.hash_algo = options_.file_hash_algorithm, .debug_inode_create = os_.getenv(kEnvVarDumpFilesRaw) ||