From 7be9aa7585fdca2ead9d8f8a4202c38e0cf88f49 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sat, 19 Aug 2023 16:18:27 +0200 Subject: [PATCH] Add reverse path ordering --- doc/mkdwarfs.md | 14 +++++++++----- include/dwarfs/inode_ordering.h | 5 +++++ include/dwarfs/options.h | 2 +- src/dwarfs/fragment_order_parser.cpp | 4 ++++ src/dwarfs/inode_manager.cpp | 11 +++++++++++ src/dwarfs/inode_ordering.cpp | 12 ++++++++++++ test/dwarfs.cpp | 14 +++++++------- 7 files changed, 49 insertions(+), 13 deletions(-) diff --git a/doc/mkdwarfs.md b/doc/mkdwarfs.md index ed16c5c8..0c567411 100644 --- a/doc/mkdwarfs.md +++ b/doc/mkdwarfs.md @@ -251,14 +251,18 @@ Most other options are concerned with compression tuning: "normalize" the permissions across the file system; this is equivalent to using `--chmod=ug-st,=Xr`. -- `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*max-children*[`:`*max-cluster-size*]]: +- `--order=none`|`path`|`revpath`|`similarity`|`nilsimsa`[`:`*max-children*[`:`*max-cluster-size*]]: The order in which inodes will be written to the file system. Choosing `none`, the inodes will be stored in the order in which they are discovered. With `path`, they will be sorted asciibetically by path name of the first file - representing this inode. With `similarity`, they will be ordered using a - simple, yet fast and efficient, similarity hash function. `nilsimsa` ordering - uses a more sophisticated similarity function that is typically better than - `similarity`, but it's significantly slower to determine a good ordering. + representing this inode. With `revpath`, they will also be ordered by path + name, but the path is traversed from the leaf to the root, so that files + with the same name will be sorted next to each other. With `similarity`, they + will be ordered using a simple, yet fast and efficient, similarity hash + function. 
+ `nilsimsa` ordering uses a more sophisticated similarity function that is + typically better than `similarity`, but can be significantly slower to + determine a good ordering. However, the new implementation of this algorithm can be parallelized and will perform much better on huge numbers of files. `nilsimsa` ordering can be tweaked by specifying a *max-children* and *max-cluster-size*. Both options diff --git a/include/dwarfs/inode_ordering.h b/include/dwarfs/inode_ordering.h index 0de45264..c96ec38a 100644 --- a/include/dwarfs/inode_ordering.h +++ b/include/dwarfs/inode_ordering.h @@ -44,6 +44,10 @@ class inode_ordering { void by_path(sortable_inode_span& sp) const { impl_->by_path(sp); } + void by_reverse_path(sortable_inode_span& sp) const { + impl_->by_reverse_path(sp); + } + void by_similarity(sortable_inode_span& sp, std::optional cat = std::nullopt) const { @@ -62,6 +66,7 @@ class inode_ordering { virtual void by_inode_number(sortable_inode_span& sp) const = 0; virtual void by_path(sortable_inode_span& sp) const = 0; + virtual void by_reverse_path(sortable_inode_span& sp) const = 0; virtual void by_similarity(sortable_inode_span& sp, std::optional cat) const = 0; virtual void diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index a6654c08..b4fa7e3f 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -78,7 +78,7 @@ struct filesystem_writer_options { }; // TODO: rename -enum class file_order_mode { NONE, PATH, SIMILARITY, NILSIMSA }; +enum class file_order_mode { NONE, PATH, REVPATH, SIMILARITY, NILSIMSA }; // TODO: rename struct file_order_options { diff --git a/src/dwarfs/fragment_order_parser.cpp b/src/dwarfs/fragment_order_parser.cpp index fc490358..b0690d16 100644 --- a/src/dwarfs/fragment_order_parser.cpp +++ b/src/dwarfs/fragment_order_parser.cpp @@ -36,6 +36,7 @@ namespace { const std::map order_choices{ {"none", file_order_mode::NONE}, {"path", file_order_mode::PATH}, + {"revpath", file_order_mode::REVPATH}, 
{"similarity", file_order_mode::SIMILARITY}, {"nilsimsa", file_order_mode::NILSIMSA}, }; @@ -133,6 +134,9 @@ fragment_order_parser::to_string(file_order_options const& opts) const { case file_order_mode::PATH: return "path"; + case file_order_mode::REVPATH: + return "revpath"; + case file_order_mode::SIMILARITY: return "similarity"; diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 45df9e17..cc3c435e 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -336,6 +336,7 @@ class inode_ : public inode { switch (opts.fragment_order.get(f.category()).mode) { case file_order_mode::NONE: case file_order_mode::PATH: + case file_order_mode::REVPATH: break; case file_order_mode::SIMILARITY: sc.try_emplace(f.category()); @@ -391,6 +392,7 @@ class inode_ : public inode { switch (order_mode) { case file_order_mode::NONE: case file_order_mode::PATH: + case file_order_mode::REVPATH: break; case file_order_mode::SIMILARITY: { @@ -616,6 +618,15 @@ auto inode_manager_::ordered_span(fragment_category cat, break; } + case file_order_mode::REVPATH: { + LOG_VERBOSE << prefix << "ordering " << span.size() + << " inodes by reverse path name..."; + auto tv = LOG_CPU_TIMED_VERBOSE; + order.by_reverse_path(span); + tv << prefix << span.size() << " inodes ordered"; + break; + } + case file_order_mode::SIMILARITY: { LOG_VERBOSE << prefix << "ordering " << span.size() << " inodes by similarity..."; diff --git a/src/dwarfs/inode_ordering.cpp b/src/dwarfs/inode_ordering.cpp index 114fe58a..f0c20596 100644 --- a/src/dwarfs/inode_ordering.cpp +++ b/src/dwarfs/inode_ordering.cpp @@ -42,6 +42,7 @@ class inode_ordering_ final : public inode_ordering::impl { void by_inode_number(sortable_inode_span& sp) const override; void by_path(sortable_inode_span& sp) const override; + void by_reverse_path(sortable_inode_span& sp) const override; void by_similarity(sortable_inode_span& sp, std::optional cat) const override; void by_nilsimsa(worker_group& wg, 
similarity_ordering_options const& opts, @@ -78,6 +79,17 @@ void inode_ordering_::by_path(sortable_inode_span& sp) const { [&](auto a, auto b) { return paths[a] < paths[b]; }); } +template +void inode_ordering_::by_reverse_path( + sortable_inode_span& sp) const { + auto raw = sp.raw(); + auto& index = sp.index(); + + std::sort(index.begin(), index.end(), [&](auto a, auto b) { + return raw[a]->any()->less_revpath(*raw[b]->any()); + }); +} + template void inode_ordering_::by_similarity( sortable_inode_span& sp, std::optional cat) const { diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index a104d496..51aa1526 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -604,13 +604,12 @@ TEST_P(packing_test, regression_empty_fs) { INSTANTIATE_TEST_SUITE_P( dwarfs, compression_test, - ::testing::Combine(::testing::ValuesIn(compressions), - ::testing::Values(12, 15, 20, 28), - ::testing::Values(file_order_mode::NONE, - file_order_mode::PATH, - file_order_mode::NILSIMSA, - file_order_mode::SIMILARITY), - ::testing::Values(std::nullopt, "xxh3-128"))); + ::testing::Combine( + ::testing::ValuesIn(compressions), ::testing::Values(12, 15, 20, 28), + ::testing::Values(file_order_mode::NONE, file_order_mode::PATH, + file_order_mode::REVPATH, file_order_mode::NILSIMSA, + file_order_mode::SIMILARITY), + ::testing::Values(std::nullopt, "xxh3-128"))); INSTANTIATE_TEST_SUITE_P( dwarfs, scanner_test, @@ -806,6 +805,7 @@ TEST_P(file_scanner, inode_ordering) { INSTANTIATE_TEST_SUITE_P( dwarfs, file_scanner, ::testing::Combine(::testing::Values(file_order_mode::PATH, + file_order_mode::REVPATH, file_order_mode::SIMILARITY), ::testing::Values(std::nullopt, "xxh3-128")));