Fix scanning logic in inode manager to perform sequential scans

This commit is contained in:
Marcus Holland-Moritz 2023-08-13 09:47:56 +02:00
parent 6866a2fef7
commit f7d3fca9e2

View File

@ -160,39 +160,28 @@ class inode_ : public inode {
catjob.set_total_size(mm->size()); catjob.set_total_size(mm->size());
catjob.categorize_random_access(mm->span()); catjob.categorize_random_access(mm->span());
if (catjob.best_result_found()) { if (!catjob.best_result_found()) {
// This means the job won't be running any sequential categorizers // We must perform a sequential categorizer scan before scanning the
// as the outcome cannot possibly be any better. As a consequence, // fragments, because the ordering is category-dependent.
// we can already fetch the result here and scan the fragments // TODO: we might be able to get away with a single scan if we
// instead of the whole file. // optimistically assume the default category and perform
// both the sequential scan and the default-category order
// scan in parallel
scan_range(
mm, [&catjob](auto span) { catjob.categorize_sequential(span); });
}
fragments_ = catjob.result(); fragments_ = catjob.result();
if (fragments_.size() > 1) { if (fragments_.size() > 1) {
scan_fragments(mm, opts); scan_fragments(mm, opts);
} else { }
}
if (fragments_.size() <= 1) {
scan_full(mm, opts); scan_full(mm, opts);
} }
} }
}
if (fragments_.empty()) {
// If we get here, we haven't scanned anything yet, and we don't know
// if the file will be fragmented or not.
scan_full(mm, opts);
if (catjob) {
fragments_ = catjob.result();
if (fragments_.size() > 1) {
// This is the unfortunate case where we have to scan the
// individual fragments after having already done a full scan.
scan_fragments(mm, opts);
}
}
}
}
// Add a fragment if nothing has been added so far. We need a single // Add a fragment if nothing has been added so far. We need a single
// fragment to store the inode's chunks. This won't use up any resources // fragment to store the inode's chunks. This won't use up any resources
@ -329,6 +318,11 @@ class inode_ : public inode {
scanner(mm->span(offset, size)); scanner(mm->span(offset, size));
} }
template <typename T>
void scan_range(mmif* mm, T&& scanner) {
scan_range(mm, 0, mm->size(), std::forward<T>(scanner));
}
void scan_fragments(mmif* mm, inode_options const& opts) { void scan_fragments(mmif* mm, inode_options const& opts) {
assert(mm); assert(mm);
assert(fragments_.size() > 1); assert(fragments_.size() > 1);
@ -400,14 +394,14 @@ class inode_ : public inode {
case file_order_mode::SIMILARITY: { case file_order_mode::SIMILARITY: {
similarity sc; similarity sc;
scan_range(mm, 0, mm->size(), sc); scan_range(mm, sc);
similarity_hash_ = sc.finalize(); // TODO similarity_hash_ = sc.finalize(); // TODO
similarity_.emplace<uint32_t>(sc.finalize()); similarity_.emplace<uint32_t>(sc.finalize());
} break; } break;
case file_order_mode::NILSIMSA: { case file_order_mode::NILSIMSA: {
nilsimsa nc; nilsimsa nc;
scan_range(mm, 0, mm->size(), nc); scan_range(mm, nc);
// TODO: can we finalize in-place? // TODO: can we finalize in-place?
nilsimsa::hash_type hash; nilsimsa::hash_type hash;
nc.finalize(hash); nc.finalize(hash);