mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-08 03:49:44 -04:00
Add --max-similarity-size option
This commit is contained in:
parent
21fc4c9524
commit
ff5f99f3d9
@ -252,6 +252,13 @@ Most other options are concerned with compression tuning:
|
||||
Last but not least, if scripting support is built into `mkdwarfs`, you can
|
||||
choose `script` to let the script determine the order.
|
||||
|
||||
- `--max-similarity-size=`*value*:
|
||||
Don't perform similarity ordering for files larger than this size. This
|
||||
helps speed up scanning, especially on slow file systems. For large files,
|
||||
the gains from similarity ordering are relatively small. When this option
|
||||
is set to a non-zero value, files larger than the limit will be stored first,
|
||||
ordered by size in descending order.
|
||||
|
||||
- `-F`, `--filter=`*rule*:
|
||||
Add a filter rule. This option can be specified multiple times.
|
||||
See [FILTER RULES](#filter-rules) for more details.
|
||||
|
@ -46,6 +46,8 @@ class inode : public object {
|
||||
using files_vector = folly::small_vector<file*, 1>;
|
||||
|
||||
virtual void set_files(files_vector&& fv) = 0;
|
||||
virtual bool needs_scan(inode_options const& opts, size_t size) const = 0;
|
||||
virtual void set_similarity_valid(inode_options const& opts) = 0;
|
||||
virtual void
|
||||
scan(std::shared_ptr<mmif> const& mm, inode_options const& options) = 0;
|
||||
virtual void set_num(uint32_t num) = 0;
|
||||
|
@ -75,6 +75,7 @@ struct filesystem_writer_options {
|
||||
// Options that control similarity hashing of inodes.
struct inode_options {
  // Compute the similarity hash for scanned inodes.
  bool with_similarity{false};
  // Compute the nilsimsa similarity hash for scanned inodes.
  bool with_nilsimsa{false};
  // Optional upper bound on file size (in bytes) for similarity scanning;
  // when unset, no size limit applies.
  std::optional<size_t> max_similarity_scan_size;

  // Returns true if at least one of the similarity hashes is enabled.
  bool needs_scan() const { return with_similarity || with_nilsimsa; }
};
|
||||
|
@ -308,7 +308,7 @@ void file_scanner_::add_inode(file* p) {
|
||||
|
||||
p->set_inode(inode);
|
||||
|
||||
if (ino_opts_.needs_scan()) {
|
||||
if (inode->needs_scan(ino_opts_, p->size())) {
|
||||
wg_.add_job([this, p, inode = std::move(inode)] {
|
||||
std::shared_ptr<mmif> mm;
|
||||
auto const size = p->size();
|
||||
@ -322,6 +322,7 @@ void file_scanner_::add_inode(file* p) {
|
||||
++prog_.files_scanned;
|
||||
});
|
||||
} else {
|
||||
inode->set_similarity_valid(ino_opts_);
|
||||
++prog_.inodes_scanned;
|
||||
++prog_.files_scanned;
|
||||
}
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <deque>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
@ -105,7 +106,7 @@ class inode_ : public inode {
|
||||
uint32_t similarity_hash() const override {
|
||||
assert(similarity_valid_);
|
||||
if (files_.empty()) {
|
||||
DWARFS_THROW(runtime_error, "inode has no file");
|
||||
DWARFS_THROW(runtime_error, "inode has no file (similarity)");
|
||||
}
|
||||
return similarity_hash_;
|
||||
}
|
||||
@ -113,7 +114,7 @@ class inode_ : public inode {
|
||||
// Returns a reference to the nilsimsa similarity hash stored for this inode.
//
// Asserts (debug builds only) that the hash has been marked valid, and
// raises a runtime_error via DWARFS_THROW if no file is attached.
nilsimsa::hash_type const& nilsimsa_similarity_hash() const override {
  assert(nilsimsa_valid_);
  if (files_.empty()) {
    DWARFS_THROW(runtime_error, "inode has no file (nilsimsa)");
  }
  return nilsimsa_similarity_hash_;
}
|
||||
@ -126,53 +127,66 @@ class inode_ : public inode {
|
||||
files_ = std::move(fv);
|
||||
}
|
||||
|
||||
// Decides whether a file of `size` bytes has to be scanned for similarity
// hashing: at least one hash must be enabled in `opts`, and `size` must not
// exceed `opts.max_similarity_scan_size` when that limit is set.
bool needs_scan(inode_options const& opts, size_t size) const override {
  if (!opts.needs_scan()) {
    return false;
  }
  auto const& limit = opts.max_similarity_scan_size;
  return !limit || size <= *limit;
}
|
||||
|
||||
void
|
||||
set_similarity_valid(inode_options const& opts [[maybe_unused]]) override {
|
||||
#ifndef NDEBUG
|
||||
assert(!similarity_valid_);
|
||||
assert(!nilsimsa_valid_);
|
||||
similarity_valid_ = opts.with_similarity;
|
||||
nilsimsa_valid_ = opts.with_nilsimsa;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
scan(std::shared_ptr<mmif> const& mm, inode_options const& opts) override {
|
||||
assert(!similarity_valid_);
|
||||
assert(!nilsimsa_valid_);
|
||||
|
||||
if (opts.needs_scan()) {
|
||||
similarity sc;
|
||||
nilsimsa nc;
|
||||
similarity sc;
|
||||
nilsimsa nc;
|
||||
|
||||
if (mm) {
|
||||
auto update_hashes = [&](uint8_t const* data, size_t size) {
|
||||
if (opts.with_similarity) {
|
||||
sc.update(data, size);
|
||||
}
|
||||
|
||||
if (opts.with_nilsimsa) {
|
||||
nc.update(data, size);
|
||||
}
|
||||
};
|
||||
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
size_t offset = 0;
|
||||
size_t size = mm->size();
|
||||
|
||||
while (size >= chunk_size) {
|
||||
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
||||
mm->release_until(offset);
|
||||
offset += chunk_size;
|
||||
size -= chunk_size;
|
||||
if (mm) {
|
||||
auto update_hashes = [&](uint8_t const* data, size_t size) {
|
||||
if (opts.with_similarity) {
|
||||
sc.update(data, size);
|
||||
}
|
||||
|
||||
update_hashes(mm->as<uint8_t>(offset), size);
|
||||
if (opts.with_nilsimsa) {
|
||||
nc.update(data, size);
|
||||
}
|
||||
};
|
||||
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
size_t offset = 0;
|
||||
size_t size = mm->size();
|
||||
|
||||
while (size >= chunk_size) {
|
||||
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
||||
mm->release_until(offset);
|
||||
offset += chunk_size;
|
||||
size -= chunk_size;
|
||||
}
|
||||
|
||||
if (opts.with_similarity) {
|
||||
similarity_hash_ = sc.finalize();
|
||||
#ifndef NDEBUG
|
||||
similarity_valid_ = true;
|
||||
#endif
|
||||
}
|
||||
update_hashes(mm->as<uint8_t>(offset), size);
|
||||
}
|
||||
|
||||
if (opts.with_nilsimsa) {
|
||||
nc.finalize(nilsimsa_similarity_hash_);
|
||||
if (opts.with_similarity) {
|
||||
similarity_hash_ = sc.finalize();
|
||||
#ifndef NDEBUG
|
||||
nilsimsa_valid_ = true;
|
||||
similarity_valid_ = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (opts.with_nilsimsa) {
|
||||
nc.finalize(nilsimsa_similarity_hash_);
|
||||
#ifndef NDEBUG
|
||||
nilsimsa_valid_ = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -190,7 +204,7 @@ class inode_ : public inode {
|
||||
|
||||
// Returns the first file attached to this inode.
//
// Raises a runtime_error via DWARFS_THROW if no file is attached.
file const* any() const override {
  if (files_.empty()) {
    DWARFS_THROW(runtime_error, "inode has no file (any)");
  }
  return files_.front();
}
|
||||
@ -413,17 +427,46 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
index.resize(count);
|
||||
std::iota(index.begin(), index.end(), 0);
|
||||
|
||||
auto empty = std::partition(index.begin(), index.end(),
|
||||
[&](auto i) { return inodes[i]->size() > 0; });
|
||||
|
||||
auto finalize_inode = [&]() {
|
||||
inodes_.push_back(std::move(inodes[index.back()]));
|
||||
index.pop_back();
|
||||
return fn(inodes_.back());
|
||||
};
|
||||
|
||||
for (auto n = std::distance(empty, index.end()); n > 0; --n) {
|
||||
finalize_inode();
|
||||
{
|
||||
auto empty = std::partition(index.begin(), index.end(),
|
||||
[&](auto i) { return inodes[i]->size() > 0; });
|
||||
|
||||
if (empty != index.end()) {
|
||||
auto count = std::distance(empty, index.end());
|
||||
|
||||
LOG_DEBUG << "finalizing " << count << " empty inodes...";
|
||||
|
||||
for (auto n = count; n > 0; --n) {
|
||||
finalize_inode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto unhashed = std::partition(index.begin(), index.end(), [&](auto i) {
|
||||
auto const& sh = inodes[i]->nilsimsa_similarity_hash();
|
||||
return std::any_of(sh.begin(), sh.end(), [](auto v) { return v != 0; });
|
||||
});
|
||||
|
||||
if (unhashed != index.end()) {
|
||||
auto count = std::distance(unhashed, index.end());
|
||||
|
||||
std::sort(unhashed, index.end(), [&inodes](auto a, auto b) {
|
||||
return inodes[a]->size() < inodes[b]->size();
|
||||
});
|
||||
|
||||
LOG_INFO << "finalizing " << count << " unhashed inodes...";
|
||||
|
||||
for (auto n = count; n > 0; --n) {
|
||||
finalize_inode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!index.empty()) {
|
||||
|
@ -372,7 +372,7 @@ int mkdwarfs(int argc, char** argv) {
|
||||
std::string path, output, memory_limit, script_arg, compression, header,
|
||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
||||
file_hash_algo, debug_filter;
|
||||
file_hash_algo, debug_filter, max_similarity_size;
|
||||
std::vector<std::string> filter;
|
||||
size_t num_workers, num_scanner_workers;
|
||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||
@ -475,6 +475,9 @@ int mkdwarfs(int argc, char** argv) {
|
||||
("order",
|
||||
po::value<std::string>(&order),
|
||||
order_desc.c_str())
|
||||
("max-similarity-size",
|
||||
po::value<std::string>(&max_similarity_size),
|
||||
"maximum file size to compute similarity")
|
||||
#ifdef DWARFS_HAVE_PYTHON
|
||||
("script",
|
||||
po::value<std::string>(&script_arg),
|
||||
@ -716,6 +719,13 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vm.count("max-similarity-size")) {
|
||||
auto size = parse_size_with_unit(max_similarity_size);
|
||||
if (size > 0) {
|
||||
options.inode.max_similarity_scan_size = size;
|
||||
}
|
||||
}
|
||||
|
||||
size_t mem_limit = parse_size_with_unit(memory_limit);
|
||||
|
||||
if (!vm.count("num-scanner-workers")) {
|
||||
@ -1024,7 +1034,8 @@ int mkdwarfs(int argc, char** argv) {
|
||||
<< "': " << strerror(errno);
|
||||
return 1;
|
||||
}
|
||||
} else if (auto oss = dynamic_cast<std::ostringstream*>(os.get())) {
|
||||
} else if (auto oss [[maybe_unused]] =
|
||||
dynamic_cast<std::ostringstream*>(os.get())) {
|
||||
assert(oss->str().empty());
|
||||
} else {
|
||||
assert(false);
|
||||
|
Loading…
x
Reference in New Issue
Block a user