mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-10 13:04:15 -04:00
Add --max-similarity-size option
This commit is contained in:
parent
21fc4c9524
commit
ff5f99f3d9
@ -252,6 +252,13 @@ Most other options are concerned with compression tuning:
|
|||||||
Last but not least, if scripting support is built into `mkdwarfs`, you can
|
Last but not least, if scripting support is built into `mkdwarfs`, you can
|
||||||
choose `script` to let the script determine the order.
|
choose `script` to let the script determine the order.
|
||||||
|
|
||||||
|
- `--max-similarity-size=`*value*:
|
||||||
|
Don't perform similarity ordering for files larger than this size. This
|
||||||
|
helps speed up scanning, especially on slow file systems. For large files,
|
||||||
|
the gains from similarity ordering are relatively small. When this option
|
||||||
|
is set to a non-zero value, files larger than the limit will be stored first,
|
||||||
|
ordered by size in descending order.
|
||||||
|
|
||||||
- `-F`, `--filter=`*rule*:
|
- `-F`, `--filter=`*rule*:
|
||||||
Add a filter rule. This option can be specified multiple times.
|
Add a filter rule. This option can be specified multiple times.
|
||||||
See [FILTER RULES](#filter-rules) for more details.
|
See [FILTER RULES](#filter-rules) for more details.
|
||||||
|
@ -46,6 +46,8 @@ class inode : public object {
|
|||||||
using files_vector = folly::small_vector<file*, 1>;
|
using files_vector = folly::small_vector<file*, 1>;
|
||||||
|
|
||||||
virtual void set_files(files_vector&& fv) = 0;
|
virtual void set_files(files_vector&& fv) = 0;
|
||||||
|
virtual bool needs_scan(inode_options const& opts, size_t size) const = 0;
|
||||||
|
virtual void set_similarity_valid(inode_options const& opts) = 0;
|
||||||
virtual void
|
virtual void
|
||||||
scan(std::shared_ptr<mmif> const& mm, inode_options const& options) = 0;
|
scan(std::shared_ptr<mmif> const& mm, inode_options const& options) = 0;
|
||||||
virtual void set_num(uint32_t num) = 0;
|
virtual void set_num(uint32_t num) = 0;
|
||||||
|
@ -75,6 +75,7 @@ struct filesystem_writer_options {
|
|||||||
struct inode_options {
|
struct inode_options {
|
||||||
bool with_similarity{false};
|
bool with_similarity{false};
|
||||||
bool with_nilsimsa{false};
|
bool with_nilsimsa{false};
|
||||||
|
std::optional<size_t> max_similarity_scan_size;
|
||||||
|
|
||||||
bool needs_scan() const { return with_similarity || with_nilsimsa; }
|
bool needs_scan() const { return with_similarity || with_nilsimsa; }
|
||||||
};
|
};
|
||||||
|
@ -308,7 +308,7 @@ void file_scanner_::add_inode(file* p) {
|
|||||||
|
|
||||||
p->set_inode(inode);
|
p->set_inode(inode);
|
||||||
|
|
||||||
if (ino_opts_.needs_scan()) {
|
if (inode->needs_scan(ino_opts_, p->size())) {
|
||||||
wg_.add_job([this, p, inode = std::move(inode)] {
|
wg_.add_job([this, p, inode = std::move(inode)] {
|
||||||
std::shared_ptr<mmif> mm;
|
std::shared_ptr<mmif> mm;
|
||||||
auto const size = p->size();
|
auto const size = p->size();
|
||||||
@ -322,6 +322,7 @@ void file_scanner_::add_inode(file* p) {
|
|||||||
++prog_.files_scanned;
|
++prog_.files_scanned;
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
inode->set_similarity_valid(ino_opts_);
|
||||||
++prog_.inodes_scanned;
|
++prog_.inodes_scanned;
|
||||||
++prog_.files_scanned;
|
++prog_.files_scanned;
|
||||||
}
|
}
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
#include <deque>
|
#include <deque>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
@ -105,7 +106,7 @@ class inode_ : public inode {
|
|||||||
uint32_t similarity_hash() const override {
|
uint32_t similarity_hash() const override {
|
||||||
assert(similarity_valid_);
|
assert(similarity_valid_);
|
||||||
if (files_.empty()) {
|
if (files_.empty()) {
|
||||||
DWARFS_THROW(runtime_error, "inode has no file");
|
DWARFS_THROW(runtime_error, "inode has no file (similarity)");
|
||||||
}
|
}
|
||||||
return similarity_hash_;
|
return similarity_hash_;
|
||||||
}
|
}
|
||||||
@ -113,7 +114,7 @@ class inode_ : public inode {
|
|||||||
nilsimsa::hash_type const& nilsimsa_similarity_hash() const override {
|
nilsimsa::hash_type const& nilsimsa_similarity_hash() const override {
|
||||||
assert(nilsimsa_valid_);
|
assert(nilsimsa_valid_);
|
||||||
if (files_.empty()) {
|
if (files_.empty()) {
|
||||||
DWARFS_THROW(runtime_error, "inode has no file");
|
DWARFS_THROW(runtime_error, "inode has no file (nilsimsa)");
|
||||||
}
|
}
|
||||||
return nilsimsa_similarity_hash_;
|
return nilsimsa_similarity_hash_;
|
||||||
}
|
}
|
||||||
@ -126,53 +127,66 @@ class inode_ : public inode {
|
|||||||
files_ = std::move(fv);
|
files_ = std::move(fv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool needs_scan(inode_options const& opts, size_t size) const override {
|
||||||
|
return opts.needs_scan() && (!opts.max_similarity_scan_size ||
|
||||||
|
size <= opts.max_similarity_scan_size.value());
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
set_similarity_valid(inode_options const& opts [[maybe_unused]]) override {
|
||||||
|
#ifndef NDEBUG
|
||||||
|
assert(!similarity_valid_);
|
||||||
|
assert(!nilsimsa_valid_);
|
||||||
|
similarity_valid_ = opts.with_similarity;
|
||||||
|
nilsimsa_valid_ = opts.with_nilsimsa;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
scan(std::shared_ptr<mmif> const& mm, inode_options const& opts) override {
|
scan(std::shared_ptr<mmif> const& mm, inode_options const& opts) override {
|
||||||
assert(!similarity_valid_);
|
assert(!similarity_valid_);
|
||||||
assert(!nilsimsa_valid_);
|
assert(!nilsimsa_valid_);
|
||||||
|
|
||||||
if (opts.needs_scan()) {
|
similarity sc;
|
||||||
similarity sc;
|
nilsimsa nc;
|
||||||
nilsimsa nc;
|
|
||||||
|
|
||||||
if (mm) {
|
if (mm) {
|
||||||
auto update_hashes = [&](uint8_t const* data, size_t size) {
|
auto update_hashes = [&](uint8_t const* data, size_t size) {
|
||||||
if (opts.with_similarity) {
|
if (opts.with_similarity) {
|
||||||
sc.update(data, size);
|
sc.update(data, size);
|
||||||
}
|
|
||||||
|
|
||||||
if (opts.with_nilsimsa) {
|
|
||||||
nc.update(data, size);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
constexpr size_t chunk_size = 32 << 20;
|
|
||||||
size_t offset = 0;
|
|
||||||
size_t size = mm->size();
|
|
||||||
|
|
||||||
while (size >= chunk_size) {
|
|
||||||
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
|
||||||
mm->release_until(offset);
|
|
||||||
offset += chunk_size;
|
|
||||||
size -= chunk_size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
update_hashes(mm->as<uint8_t>(offset), size);
|
if (opts.with_nilsimsa) {
|
||||||
|
nc.update(data, size);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr size_t chunk_size = 32 << 20;
|
||||||
|
size_t offset = 0;
|
||||||
|
size_t size = mm->size();
|
||||||
|
|
||||||
|
while (size >= chunk_size) {
|
||||||
|
update_hashes(mm->as<uint8_t>(offset), chunk_size);
|
||||||
|
mm->release_until(offset);
|
||||||
|
offset += chunk_size;
|
||||||
|
size -= chunk_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opts.with_similarity) {
|
update_hashes(mm->as<uint8_t>(offset), size);
|
||||||
similarity_hash_ = sc.finalize();
|
}
|
||||||
#ifndef NDEBUG
|
|
||||||
similarity_valid_ = true;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (opts.with_nilsimsa) {
|
if (opts.with_similarity) {
|
||||||
nc.finalize(nilsimsa_similarity_hash_);
|
similarity_hash_ = sc.finalize();
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
nilsimsa_valid_ = true;
|
similarity_valid_ = true;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (opts.with_nilsimsa) {
|
||||||
|
nc.finalize(nilsimsa_similarity_hash_);
|
||||||
|
#ifndef NDEBUG
|
||||||
|
nilsimsa_valid_ = true;
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -190,7 +204,7 @@ class inode_ : public inode {
|
|||||||
|
|
||||||
file const* any() const override {
|
file const* any() const override {
|
||||||
if (files_.empty()) {
|
if (files_.empty()) {
|
||||||
DWARFS_THROW(runtime_error, "inode has no file");
|
DWARFS_THROW(runtime_error, "inode has no file (any)");
|
||||||
}
|
}
|
||||||
return files_.front();
|
return files_.front();
|
||||||
}
|
}
|
||||||
@ -413,17 +427,46 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
|||||||
index.resize(count);
|
index.resize(count);
|
||||||
std::iota(index.begin(), index.end(), 0);
|
std::iota(index.begin(), index.end(), 0);
|
||||||
|
|
||||||
auto empty = std::partition(index.begin(), index.end(),
|
|
||||||
[&](auto i) { return inodes[i]->size() > 0; });
|
|
||||||
|
|
||||||
auto finalize_inode = [&]() {
|
auto finalize_inode = [&]() {
|
||||||
inodes_.push_back(std::move(inodes[index.back()]));
|
inodes_.push_back(std::move(inodes[index.back()]));
|
||||||
index.pop_back();
|
index.pop_back();
|
||||||
return fn(inodes_.back());
|
return fn(inodes_.back());
|
||||||
};
|
};
|
||||||
|
|
||||||
for (auto n = std::distance(empty, index.end()); n > 0; --n) {
|
{
|
||||||
finalize_inode();
|
auto empty = std::partition(index.begin(), index.end(),
|
||||||
|
[&](auto i) { return inodes[i]->size() > 0; });
|
||||||
|
|
||||||
|
if (empty != index.end()) {
|
||||||
|
auto count = std::distance(empty, index.end());
|
||||||
|
|
||||||
|
LOG_DEBUG << "finalizing " << count << " empty inodes...";
|
||||||
|
|
||||||
|
for (auto n = count; n > 0; --n) {
|
||||||
|
finalize_inode();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
auto unhashed = std::partition(index.begin(), index.end(), [&](auto i) {
|
||||||
|
auto const& sh = inodes[i]->nilsimsa_similarity_hash();
|
||||||
|
return std::any_of(sh.begin(), sh.end(), [](auto v) { return v != 0; });
|
||||||
|
});
|
||||||
|
|
||||||
|
if (unhashed != index.end()) {
|
||||||
|
auto count = std::distance(unhashed, index.end());
|
||||||
|
|
||||||
|
std::sort(unhashed, index.end(), [&inodes](auto a, auto b) {
|
||||||
|
return inodes[a]->size() < inodes[b]->size();
|
||||||
|
});
|
||||||
|
|
||||||
|
LOG_INFO << "finalizing " << count << " unhashed inodes...";
|
||||||
|
|
||||||
|
for (auto n = count; n > 0; --n) {
|
||||||
|
finalize_inode();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!index.empty()) {
|
if (!index.empty()) {
|
||||||
|
@ -372,7 +372,7 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
std::string path, output, memory_limit, script_arg, compression, header,
|
std::string path, output, memory_limit, script_arg, compression, header,
|
||||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
||||||
file_hash_algo, debug_filter;
|
file_hash_algo, debug_filter, max_similarity_size;
|
||||||
std::vector<std::string> filter;
|
std::vector<std::string> filter;
|
||||||
size_t num_workers, num_scanner_workers;
|
size_t num_workers, num_scanner_workers;
|
||||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||||
@ -475,6 +475,9 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
("order",
|
("order",
|
||||||
po::value<std::string>(&order),
|
po::value<std::string>(&order),
|
||||||
order_desc.c_str())
|
order_desc.c_str())
|
||||||
|
("max-similarity-size",
|
||||||
|
po::value<std::string>(&max_similarity_size),
|
||||||
|
"maximum file size to compute similarity")
|
||||||
#ifdef DWARFS_HAVE_PYTHON
|
#ifdef DWARFS_HAVE_PYTHON
|
||||||
("script",
|
("script",
|
||||||
po::value<std::string>(&script_arg),
|
po::value<std::string>(&script_arg),
|
||||||
@ -716,6 +719,13 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (vm.count("max-similarity-size")) {
|
||||||
|
auto size = parse_size_with_unit(max_similarity_size);
|
||||||
|
if (size > 0) {
|
||||||
|
options.inode.max_similarity_scan_size = size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
size_t mem_limit = parse_size_with_unit(memory_limit);
|
size_t mem_limit = parse_size_with_unit(memory_limit);
|
||||||
|
|
||||||
if (!vm.count("num-scanner-workers")) {
|
if (!vm.count("num-scanner-workers")) {
|
||||||
@ -1024,7 +1034,8 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
<< "': " << strerror(errno);
|
<< "': " << strerror(errno);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
} else if (auto oss = dynamic_cast<std::ostringstream*>(os.get())) {
|
} else if (auto oss [[maybe_unused]] =
|
||||||
|
dynamic_cast<std::ostringstream*>(os.get())) {
|
||||||
assert(oss->str().empty());
|
assert(oss->str().empty());
|
||||||
} else {
|
} else {
|
||||||
assert(false);
|
assert(false);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user