Add --max-similarity-size option

This commit is contained in:
Marcus Holland-Moritz 2022-11-06 14:32:14 +01:00
parent 21fc4c9524
commit ff5f99f3d9
6 changed files with 110 additions and 45 deletions

View File

@ -252,6 +252,13 @@ Most other options are concerned with compression tuning:
Last but not least, if scripting support is built into `mkdwarfs`, you can
choose `script` to let the script determine the order.
- `--max-similarity-size=`*value*:
Don't perform similarity ordering for files larger than this size. This
helps speed up scanning, especially on slow file systems. For large files,
the gains from similarity ordering are relatively small. When this option
is set to a non-zero value, files larger than the limit will be stored first,
ordered by size in descending order. A value of zero disables the limit.
- `-F`, `--filter=`*rule*:
Add a filter rule. This option can be specified multiple times.
See [FILTER RULES](#filter-rules) for more details.

View File

@ -46,6 +46,8 @@ class inode : public object {
using files_vector = folly::small_vector<file*, 1>;
virtual void set_files(files_vector&& fv) = 0;
virtual bool needs_scan(inode_options const& opts, size_t size) const = 0;
virtual void set_similarity_valid(inode_options const& opts) = 0;
virtual void
scan(std::shared_ptr<mmif> const& mm, inode_options const& options) = 0;
virtual void set_num(uint32_t num) = 0;

View File

@ -75,6 +75,7 @@ struct filesystem_writer_options {
// Options controlling which similarity hashes are computed for an inode and
// for which files that computation is attempted.
struct inode_options {
  // Compute the plain similarity hash while scanning.
  bool with_similarity = false;
  // Compute the nilsimsa similarity hash while scanning.
  bool with_nilsimsa = false;
  // Optional upper bound on the file size that is still scanned; when unset,
  // no size limit applies.
  std::optional<size_t> max_similarity_scan_size;

  // True if at least one of the hashes has been requested.
  bool needs_scan() const {
    if (with_similarity) {
      return true;
    }
    return with_nilsimsa;
  }
};

View File

@ -308,7 +308,7 @@ void file_scanner_::add_inode(file* p) {
p->set_inode(inode);
if (ino_opts_.needs_scan()) {
if (inode->needs_scan(ino_opts_, p->size())) {
wg_.add_job([this, p, inode = std::move(inode)] {
std::shared_ptr<mmif> mm;
auto const size = p->size();
@ -322,6 +322,7 @@ void file_scanner_::add_inode(file* p) {
++prog_.files_scanned;
});
} else {
inode->set_similarity_valid(ino_opts_);
++prog_.inodes_scanned;
++prog_.files_scanned;
}

View File

@ -23,6 +23,7 @@
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <fstream>
#include <limits>
@ -105,7 +106,7 @@ class inode_ : public inode {
// Returns the stored similarity hash (similarity_hash_).
// Asserts that similarity_valid_ is set and reports a runtime_error through
// DWARFS_THROW when the inode has no files attached.
// NOTE(review): two consecutive DWARFS_THROW statements appear below; this
// looks like the pre- and post-change line of the diff shown together --
// confirm which message is the intended one.
uint32_t similarity_hash() const override {
assert(similarity_valid_);
if (files_.empty()) {
DWARFS_THROW(runtime_error, "inode has no file");
DWARFS_THROW(runtime_error, "inode has no file (similarity)");
}
return similarity_hash_;
}
@ -113,7 +114,7 @@ class inode_ : public inode {
// Returns a reference to the stored nilsimsa hash (nilsimsa_similarity_hash_).
// Asserts that nilsimsa_valid_ is set and reports a runtime_error through
// DWARFS_THROW when the inode has no files attached.
// NOTE(review): two consecutive DWARFS_THROW statements appear below; this
// looks like the pre- and post-change line of the diff shown together --
// confirm which message is the intended one.
nilsimsa::hash_type const& nilsimsa_similarity_hash() const override {
assert(nilsimsa_valid_);
if (files_.empty()) {
DWARFS_THROW(runtime_error, "inode has no file");
DWARFS_THROW(runtime_error, "inode has no file (nilsimsa)");
}
return nilsimsa_similarity_hash_;
}
@ -126,53 +127,66 @@ class inode_ : public inode {
files_ = std::move(fv);
}
// Decides whether a file of `size` bytes should be scanned for similarity
// hashes: some hash must be requested in `opts`, and the file must not exceed
// the optional max_similarity_scan_size limit.
bool needs_scan(inode_options const& opts, size_t size) const override {
  if (!opts.needs_scan()) {
    return false;
  }
  auto const& limit = opts.max_similarity_scan_size;
  // No limit configured means every file qualifies.
  return !limit.has_value() || size <= *limit;
}
void
set_similarity_valid(inode_options const& opts [[maybe_unused]]) override {
#ifndef NDEBUG
assert(!similarity_valid_);
assert(!nilsimsa_valid_);
similarity_valid_ = opts.with_similarity;
nilsimsa_valid_ = opts.with_nilsimsa;
#endif
}
// Computes the hashes requested in `opts` over the mapped file `mm`.
// When `mm` is non-null, its contents are fed to the enabled hashers in
// chunks of 32 MiB (chunk_size = 32 << 20), calling mm->release_until(offset)
// after each full chunk, followed by the remaining tail. The enabled hashes
// are then finalized into similarity_hash_ / nilsimsa_similarity_hash_, and in
// debug builds the matching *_valid_ flags are set.
// NOTE(review): the lines below interleave the pre- and post-change versions
// of this function as rendered by the diff view (duplicated declarations,
// unbalanced braces); the code is left untouched -- consult the actual file
// for the effective body.
void
scan(std::shared_ptr<mmif> const& mm, inode_options const& opts) override {
assert(!similarity_valid_);
assert(!nilsimsa_valid_);
if (opts.needs_scan()) {
similarity sc;
nilsimsa nc;
similarity sc;
nilsimsa nc;
if (mm) {
auto update_hashes = [&](uint8_t const* data, size_t size) {
if (opts.with_similarity) {
sc.update(data, size);
}
if (opts.with_nilsimsa) {
nc.update(data, size);
}
};
constexpr size_t chunk_size = 32 << 20;
size_t offset = 0;
size_t size = mm->size();
while (size >= chunk_size) {
update_hashes(mm->as<uint8_t>(offset), chunk_size);
mm->release_until(offset);
offset += chunk_size;
size -= chunk_size;
if (mm) {
auto update_hashes = [&](uint8_t const* data, size_t size) {
if (opts.with_similarity) {
sc.update(data, size);
}
update_hashes(mm->as<uint8_t>(offset), size);
if (opts.with_nilsimsa) {
nc.update(data, size);
}
};
constexpr size_t chunk_size = 32 << 20;
size_t offset = 0;
size_t size = mm->size();
while (size >= chunk_size) {
update_hashes(mm->as<uint8_t>(offset), chunk_size);
mm->release_until(offset);
offset += chunk_size;
size -= chunk_size;
}
if (opts.with_similarity) {
similarity_hash_ = sc.finalize();
#ifndef NDEBUG
similarity_valid_ = true;
#endif
}
update_hashes(mm->as<uint8_t>(offset), size);
}
if (opts.with_nilsimsa) {
nc.finalize(nilsimsa_similarity_hash_);
if (opts.with_similarity) {
similarity_hash_ = sc.finalize();
#ifndef NDEBUG
nilsimsa_valid_ = true;
similarity_valid_ = true;
#endif
}
if (opts.with_nilsimsa) {
nc.finalize(nilsimsa_similarity_hash_);
#ifndef NDEBUG
nilsimsa_valid_ = true;
#endif
}
}
}
@ -190,7 +204,7 @@ class inode_ : public inode {
// Returns the first file attached to this inode (files_.front()).
// Reports a runtime_error through DWARFS_THROW when no file is attached.
// NOTE(review): two consecutive DWARFS_THROW statements appear below; this
// looks like the pre- and post-change line of the diff shown together --
// confirm which message is the intended one.
file const* any() const override {
if (files_.empty()) {
DWARFS_THROW(runtime_error, "inode has no file");
DWARFS_THROW(runtime_error, "inode has no file (any)");
}
return files_.front();
}
@ -413,17 +427,46 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
index.resize(count);
std::iota(index.begin(), index.end(), 0);
auto empty = std::partition(index.begin(), index.end(),
[&](auto i) { return inodes[i]->size() > 0; });
auto finalize_inode = [&]() {
inodes_.push_back(std::move(inodes[index.back()]));
index.pop_back();
return fn(inodes_.back());
};
for (auto n = std::distance(empty, index.end()); n > 0; --n) {
finalize_inode();
{
auto empty = std::partition(index.begin(), index.end(),
[&](auto i) { return inodes[i]->size() > 0; });
if (empty != index.end()) {
auto count = std::distance(empty, index.end());
LOG_DEBUG << "finalizing " << count << " empty inodes...";
for (auto n = count; n > 0; --n) {
finalize_inode();
}
}
}
{
auto unhashed = std::partition(index.begin(), index.end(), [&](auto i) {
auto const& sh = inodes[i]->nilsimsa_similarity_hash();
return std::any_of(sh.begin(), sh.end(), [](auto v) { return v != 0; });
});
if (unhashed != index.end()) {
auto count = std::distance(unhashed, index.end());
std::sort(unhashed, index.end(), [&inodes](auto a, auto b) {
return inodes[a]->size() < inodes[b]->size();
});
LOG_INFO << "finalizing " << count << " unhashed inodes...";
for (auto n = count; n > 0; --n) {
finalize_inode();
}
}
}
if (!index.empty()) {

View File

@ -372,7 +372,7 @@ int mkdwarfs(int argc, char** argv) {
std::string path, output, memory_limit, script_arg, compression, header,
schema_compression, metadata_compression, log_level_str, timestamp,
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
file_hash_algo, debug_filter;
file_hash_algo, debug_filter, max_similarity_size;
std::vector<std::string> filter;
size_t num_workers, num_scanner_workers;
bool no_progress = false, remove_header = false, no_section_index = false,
@ -475,6 +475,9 @@ int mkdwarfs(int argc, char** argv) {
("order",
po::value<std::string>(&order),
order_desc.c_str())
("max-similarity-size",
po::value<std::string>(&max_similarity_size),
"maximum file size to compute similarity")
#ifdef DWARFS_HAVE_PYTHON
("script",
po::value<std::string>(&script_arg),
@ -716,6 +719,13 @@ int mkdwarfs(int argc, char** argv) {
return 1;
}
if (vm.count("max-similarity-size")) {
auto size = parse_size_with_unit(max_similarity_size);
if (size > 0) {
options.inode.max_similarity_scan_size = size;
}
}
size_t mem_limit = parse_size_with_unit(memory_limit);
if (!vm.count("num-scanner-workers")) {
@ -1024,7 +1034,8 @@ int mkdwarfs(int argc, char** argv) {
<< "': " << strerror(errno);
return 1;
}
} else if (auto oss = dynamic_cast<std::ostringstream*>(os.get())) {
} else if (auto oss [[maybe_unused]] =
dynamic_cast<std::ostringstream*>(os.get())) {
assert(oss->str().empty());
} else {
assert(false);