Add reverse path ordering

This commit is contained in:
Marcus Holland-Moritz 2023-08-19 16:18:27 +02:00
parent 1e80009d45
commit 7be9aa7585
7 changed files with 49 additions and 13 deletions

View File

@ -251,14 +251,18 @@ Most other options are concerned with compression tuning:
"normalize" the permissions across the file system; this is equivalent to
using `--chmod=ug-st,=Xr`.
- `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*max-children*[`:`*max-cluster-size*]]:
- `--order=none`|`path`|`revpath`|`similarity`|`nilsimsa`[`:`*max-children*[`:`*max-cluster-size*]]:
The order in which inodes will be written to the file system. With `none`,
the inodes will be stored in the order in which they are discovered. With
`path`, they will be sorted asciibetically by path name of the first file
representing this inode. With `similarity`, they will be ordered using a
simple, yet fast and efficient, similarity hash function. `nilsimsa` ordering
uses a more sophisticated similarity function that is typically better than
`similarity`, but it's significantly slower to determine a good ordering.
representing this inode. With `revpath`, they will also be ordered by path
name, but the path is traversed from the leaf to the root, so files with
the same name will be sorted next to each other. With `similarity`, they
will be ordered using a simple, yet fast and efficient, similarity hash
function.
`nilsimsa` ordering uses a more sophisticated similarity function that is
typically better than `similarity`, but can be significantly slower to
determine a good ordering.
However, the new implementation of this algorithm can be parallelized and
will perform much better on huge numbers of files. `nilsimsa` ordering can
be tweaked by specifying a *max-children* and *max-cluster-size*. Both options

View File

@ -44,6 +44,10 @@ class inode_ordering {
void by_path(sortable_inode_span& sp) const { impl_->by_path(sp); }
// Orders `sp` by reverse path; the work is delegated to the implementation
// object held in `impl_`.
void by_reverse_path(sortable_inode_span& sp) const {
  return impl_->by_reverse_path(sp);
}
void
by_similarity(sortable_inode_span& sp,
std::optional<fragment_category> cat = std::nullopt) const {
@ -62,6 +66,7 @@ class inode_ordering {
virtual void by_inode_number(sortable_inode_span& sp) const = 0;
virtual void by_path(sortable_inode_span& sp) const = 0;
// Reorders `sp` by reverse path name; must be provided by the implementation.
virtual void by_reverse_path(sortable_inode_span& sp) const = 0;
virtual void by_similarity(sortable_inode_span& sp,
std::optional<fragment_category> cat) const = 0;
virtual void

View File

@ -78,7 +78,7 @@ struct filesystem_writer_options {
};
// TODO: rename
enum class file_order_mode { NONE, PATH, SIMILARITY, NILSIMSA };
// Selects how inodes are ordered before being written to the file system.
enum class file_order_mode {
  NONE,       // keep the order in which the inodes were discovered
  PATH,       // sort by path name
  REVPATH,    // sort by path name, traversed from the leaf to the root
  SIMILARITY, // order using a simple similarity hash function
  NILSIMSA,   // order using the more sophisticated nilsimsa similarity
};
// TODO: rename
struct file_order_options {

View File

@ -36,6 +36,7 @@ namespace {
// Lookup table from the textual order mode names to their `file_order_mode`
// values.
std::map<std::string_view, file_order_mode> const order_choices = {
    {"none", file_order_mode::NONE},
    {"path", file_order_mode::PATH},
    {"revpath", file_order_mode::REVPATH},
    {"similarity", file_order_mode::SIMILARITY},
    {"nilsimsa", file_order_mode::NILSIMSA},
};
@ -133,6 +134,9 @@ fragment_order_parser::to_string(file_order_options const& opts) const {
case file_order_mode::PATH:
return "path";
case file_order_mode::REVPATH:
return "revpath";
case file_order_mode::SIMILARITY:
return "similarity";

View File

@ -336,6 +336,7 @@ class inode_ : public inode {
switch (opts.fragment_order.get(f.category()).mode) {
case file_order_mode::NONE:
case file_order_mode::PATH:
case file_order_mode::REVPATH:
break;
case file_order_mode::SIMILARITY:
sc.try_emplace(f.category());
@ -391,6 +392,7 @@ class inode_ : public inode {
switch (order_mode) {
case file_order_mode::NONE:
case file_order_mode::PATH:
case file_order_mode::REVPATH:
break;
case file_order_mode::SIMILARITY: {
@ -616,6 +618,15 @@ auto inode_manager_<LoggerPolicy>::ordered_span(fragment_category cat,
break;
}
case file_order_mode::REVPATH: {
LOG_VERBOSE << prefix << "ordering " << span.size()
<< " inodes by reverse path name...";
auto tv = LOG_CPU_TIMED_VERBOSE;
order.by_reverse_path(span);
tv << prefix << span.size() << " inodes ordered";
break;
}
case file_order_mode::SIMILARITY: {
LOG_VERBOSE << prefix << "ordering " << span.size()
<< " inodes by similarity...";

View File

@ -42,6 +42,7 @@ class inode_ordering_ final : public inode_ordering::impl {
void by_inode_number(sortable_inode_span& sp) const override;
void by_path(sortable_inode_span& sp) const override;
// Overrides inode_ordering::impl::by_reverse_path.
void by_reverse_path(sortable_inode_span& sp) const override;
void by_similarity(sortable_inode_span& sp,
std::optional<fragment_category> cat) const override;
void by_nilsimsa(worker_group& wg, similarity_ordering_options const& opts,
@ -78,6 +79,17 @@ void inode_ordering_<LoggerPolicy>::by_path(sortable_inode_span& sp) const {
[&](auto a, auto b) { return paths[a] < paths[b]; });
}
// Sorts the index of `sp` in reverse path order.
//
// Two index entries are compared by looking up their inodes in the raw span,
// taking the object returned by `any()` for each, and asking the first one
// whether it sorts before the second via `less_revpath()`.
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_reverse_path(
    sortable_inode_span& sp) const {
  auto inodes = sp.raw();
  auto& order = sp.index();

  // Strict ordering predicate over index entries, used by std::sort below.
  auto const revpath_before = [&](auto lhs, auto rhs) {
    return inodes[lhs]->any()->less_revpath(*inodes[rhs]->any());
  };

  std::sort(order.begin(), order.end(), revpath_before);
}
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_similarity(
sortable_inode_span& sp, std::optional<fragment_category> cat) const {

View File

@ -604,13 +604,12 @@ TEST_P(packing_test, regression_empty_fs) {
INSTANTIATE_TEST_SUITE_P(
dwarfs, compression_test,
::testing::Combine(::testing::ValuesIn(compressions),
::testing::Values(12, 15, 20, 28),
::testing::Values(file_order_mode::NONE,
file_order_mode::PATH,
file_order_mode::NILSIMSA,
file_order_mode::SIMILARITY),
::testing::Values(std::nullopt, "xxh3-128")));
::testing::Combine(
::testing::ValuesIn(compressions), ::testing::Values(12, 15, 20, 28),
::testing::Values(file_order_mode::NONE, file_order_mode::PATH,
file_order_mode::REVPATH, file_order_mode::NILSIMSA,
file_order_mode::SIMILARITY),
::testing::Values(std::nullopt, "xxh3-128")));
INSTANTIATE_TEST_SUITE_P(
dwarfs, scanner_test,
@ -806,6 +805,7 @@ TEST_P(file_scanner, inode_ordering) {
// Instantiates the file_scanner tests for every combination of the listed
// order modes (PATH, REVPATH, SIMILARITY) and the second parameter, which is
// either std::nullopt or "xxh3-128".
// NOTE(review): the second parameter looks like a file hash algorithm name;
// confirm against the file_scanner fixture.
INSTANTIATE_TEST_SUITE_P(
dwarfs, file_scanner,
::testing::Combine(::testing::Values(file_order_mode::PATH,
file_order_mode::REVPATH,
file_order_mode::SIMILARITY),
::testing::Values(std::nullopt, "xxh3-128")));