From ff5f99f3d92a14d26dacad79fff5e52a924abba4 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sun, 6 Nov 2022 14:32:14 +0100 Subject: [PATCH] Add --max-similarity-size option --- doc/mkdwarfs.md | 7 ++ include/dwarfs/inode.h | 2 + include/dwarfs/options.h | 1 + src/dwarfs/file_scanner.cpp | 3 +- src/dwarfs/inode_manager.cpp | 127 +++++++++++++++++++++++------------ src/mkdwarfs.cpp | 15 ++++- 6 files changed, 110 insertions(+), 45 deletions(-) diff --git a/doc/mkdwarfs.md b/doc/mkdwarfs.md index e4d4fd8b..eb20d546 100644 --- a/doc/mkdwarfs.md +++ b/doc/mkdwarfs.md @@ -252,6 +252,13 @@ Most other options are concerned with compression tuning: Last but not least, if scripting support is built into `mkdwarfs`, you can choose `script` to let the script determine the order. +- `--max-similarity-size=`*value*: + Don't perform similarity ordering for files larger than this size. This + helps speed up scanning, especially on slow file systems. For large files, + the gains from similarity ordering are relatively small. When this option + is set to a non-zero value, files larger than the limit will be stored first, + ordered by size in descending order. + - `-F`, `--filter=`*rule*: Add a filter rule. This option can be specified multiple times. See [FILTER RULES](#filter-rules) for more details. 
diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index 0e9edd74..f0a3307e 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -46,6 +46,8 @@ class inode : public object { using files_vector = folly::small_vector; virtual void set_files(files_vector&& fv) = 0; + /* Returns whether a file of `size` bytes should be scanned under `opts`. */ virtual bool needs_scan(inode_options const& opts, size_t size) const = 0; + /* Called by file_scanner_::add_inode() when needs_scan() returned false, i.e. when no scan job is queued. */ virtual void set_similarity_valid(inode_options const& opts) = 0; virtual void scan(std::shared_ptr const& mm, inode_options const& options) = 0; virtual void set_num(uint32_t num) = 0; diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index d2a7b160..eb3631b3 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -75,6 +75,7 @@ struct filesystem_writer_options { struct inode_options { bool with_similarity{false}; bool with_nilsimsa{false}; + /* Largest file size that is still scanned for similarity; when unset, inode_::needs_scan() applies no size limit. */ std::optional max_similarity_scan_size; bool needs_scan() const { return with_similarity || with_nilsimsa; } }; diff --git a/src/dwarfs/file_scanner.cpp b/src/dwarfs/file_scanner.cpp index 4eee8b0b..11e3c651 100644 --- a/src/dwarfs/file_scanner.cpp +++ b/src/dwarfs/file_scanner.cpp @@ -308,7 +308,7 @@ void file_scanner_::add_inode(file* p) { p->set_inode(inode); - if (ino_opts_.needs_scan()) { + /* Per-file decision: this can be false even when hashing is enabled, because the file may exceed the size limit. */ if (inode->needs_scan(ino_opts_, p->size())) { wg_.add_job([this, p, inode = std::move(inode)] { std::shared_ptr mm; auto const size = p->size(); @@ -322,6 +322,7 @@ void file_scanner_::add_inode(file* p) { ++prog_.files_scanned; }); } else { + /* No scan job is queued for this inode; in inode_ this call only updates the debug-build validity flags. */ inode->set_similarity_valid(ino_opts_); ++prog_.inodes_scanned; ++prog_.files_scanned; } diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 0f007c80..24d7867b 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -105,7 +106,7 @@ class inode_ : public inode { uint32_t similarity_hash() const override { assert(similarity_valid_); if (files_.empty()) { - 
DWARFS_THROW(runtime_error, "inode has no file"); + /* The suffix identifies which accessor raised the error. */ DWARFS_THROW(runtime_error, "inode has no file (similarity)"); } return similarity_hash_; } @@ -113,7 +114,7 @@ class inode_ : public inode { nilsimsa::hash_type const& nilsimsa_similarity_hash() const override { assert(nilsimsa_valid_); if (files_.empty()) { - DWARFS_THROW(runtime_error, "inode has no file"); + DWARFS_THROW(runtime_error, "inode has no file (nilsimsa)"); } return nilsimsa_similarity_hash_; } @@ -126,53 +127,66 @@ class inode_ : public inode { files_ = std::move(fv); } + /* True when similarity or nilsimsa hashing is enabled in `opts` and `size` does not exceed max_similarity_scan_size (no limit if that option is unset). */ bool needs_scan(inode_options const& opts, size_t size) const override { + return opts.needs_scan() && (!opts.max_similarity_scan_size || + size <= opts.max_similarity_scan_size.value()); + } + + /* Debug builds only: checks that neither validity flag is set yet, then sets each flag to the matching with_* option so the asserts in similarity_hash() and nilsimsa_similarity_hash() hold for an inode that was never scanned. Does nothing when NDEBUG is defined. */ void + set_similarity_valid(inode_options const& opts [[maybe_unused]]) override { +#ifndef NDEBUG + assert(!similarity_valid_); + assert(!nilsimsa_valid_); + similarity_valid_ = opts.with_similarity; + nilsimsa_valid_ = opts.with_nilsimsa; +#endif + } + void scan(std::shared_ptr const& mm, inode_options const& opts) override { assert(!similarity_valid_); assert(!nilsimsa_valid_); - if (opts.needs_scan()) { - similarity sc; - nilsimsa nc; + /* The opts.needs_scan() guard is gone from scan(); presumably callers check needs_scan() first, as file_scanner_::add_inode() does before queueing its job. */ similarity sc; + nilsimsa nc; - if (mm) { - auto update_hashes = [&](uint8_t const* data, size_t size) { - if (opts.with_similarity) { - sc.update(data, size); - } - - if (opts.with_nilsimsa) { - nc.update(data, size); - } - }; - - constexpr size_t chunk_size = 32 << 20; - size_t offset = 0; - size_t size = mm->size(); - - while (size >= chunk_size) { - update_hashes(mm->as(offset), chunk_size); - mm->release_until(offset); - offset += chunk_size; - size -= chunk_size; + if (mm) { + auto update_hashes = [&](uint8_t const* data, size_t size) { + if (opts.with_similarity) { + sc.update(data, size); } - update_hashes(mm->as(offset), size); + if (opts.with_nilsimsa) { + nc.update(data, size); + } + }; + + /* 32 << 20 bytes = 32 MiB per hashing chunk. */ constexpr size_t chunk_size = 32 << 20; + size_t offset = 0; + size_t size = mm->size(); + + while 
(size >= chunk_size) { + update_hashes(mm->as(offset), chunk_size); + mm->release_until(offset); + offset += chunk_size; + size -= chunk_size; } - if (opts.with_similarity) { - similarity_hash_ = sc.finalize(); -#ifndef NDEBUG - similarity_valid_ = true; -#endif - } + /* Hash the remaining tail, which is shorter than chunk_size. */ update_hashes(mm->as(offset), size); + } - if (opts.with_nilsimsa) { - nc.finalize(nilsimsa_similarity_hash_); + if (opts.with_similarity) { + similarity_hash_ = sc.finalize(); #ifndef NDEBUG - nilsimsa_valid_ = true; + similarity_valid_ = true; +#endif + } + + if (opts.with_nilsimsa) { + nc.finalize(nilsimsa_similarity_hash_); +#ifndef NDEBUG + nilsimsa_valid_ = true; #endif - } } } @@ -190,7 +204,7 @@ class inode_ : public inode { file const* any() const override { if (files_.empty()) { - DWARFS_THROW(runtime_error, "inode has no file"); + DWARFS_THROW(runtime_error, "inode has no file (any)"); } return files_.front(); } @@ -413,17 +427,46 @@ void inode_manager_::order_inodes_by_nilsimsa( index.resize(count); std::iota(index.begin(), index.end(), 0); - auto empty = std::partition(index.begin(), index.end(), - [&](auto i) { return inodes[i]->size() > 0; }); - auto finalize_inode = [&]() { inodes_.push_back(std::move(inodes[index.back()])); index.pop_back(); return fn(inodes_.back()); }; - for (auto n = std::distance(empty, index.end()); n > 0; --n) { - finalize_inode(); + { + /* Step 1: move zero-size inodes to the tail of index; finalize_inode() consumes from the back, so they are finalized first. */ auto empty = std::partition(index.begin(), index.end(), + [&](auto i) { return inodes[i]->size() > 0; }); + + if (empty != index.end()) { + /* NOTE(review): this local shadows the count used by index.resize(count) above; consider renaming. */ auto count = std::distance(empty, index.end()); + + LOG_DEBUG << "finalizing " << count << " empty inodes..."; + + for (auto n = count; n > 0; --n) { + finalize_inode(); + } + } + } + + { + /* Step 2: inodes whose nilsimsa hash is all zero are treated as unhashed and moved to the tail; presumably these are the files skipped by the size limit - confirm. */ auto unhashed = std::partition(index.begin(), index.end(), [&](auto i) { + auto const& sh = inodes[i]->nilsimsa_similarity_hash(); + return std::any_of(sh.begin(), sh.end(), [](auto v) { return v != 0; }); + }); + + if (unhashed != index.end()) { + /* NOTE(review): shadows the outer count as well. */ auto count = std::distance(unhashed, index.end()); + + 
/* Ascending by size: finalize_inode() pops from the back, so the largest unhashed inode is finalized first. */ std::sort(unhashed, index.end(), [&inodes](auto a, auto b) { + return inodes[a]->size() < inodes[b]->size(); + }); + + LOG_INFO << "finalizing " << count << " unhashed inodes..."; + + for (auto n = count; n > 0; --n) { + finalize_inode(); + } + } } if (!index.empty()) { diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index d4f6c797..ebe36258 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -372,7 +372,7 @@ int mkdwarfs(int argc, char** argv) { std::string path, output, memory_limit, script_arg, compression, header, schema_compression, metadata_compression, log_level_str, timestamp, time_resolution, order, progress_mode, recompress_opts, pack_metadata, - file_hash_algo, debug_filter; + file_hash_algo, debug_filter, max_similarity_size; std::vector filter; size_t num_workers, num_scanner_workers; bool no_progress = false, remove_header = false, no_section_index = false, @@ -475,6 +475,9 @@ int mkdwarfs(int argc, char** argv) { ("order", po::value(&order), order_desc.c_str()) + /* Kept as a string here; converted later with parse_size_with_unit(). */ ("max-similarity-size", + po::value(&max_similarity_size), + "maximum file size to compute similarity") #ifdef DWARFS_HAVE_PYTHON ("script", po::value(&script_arg), @@ -716,6 +719,13 @@ int mkdwarfs(int argc, char** argv) { return 1; } + /* Only a non-zero parsed size sets the limit; a value of 0 leaves max_similarity_scan_size unset. */ if (vm.count("max-similarity-size")) { + auto size = parse_size_with_unit(max_similarity_size); + if (size > 0) { + options.inode.max_similarity_scan_size = size; + } + } + size_t mem_limit = parse_size_with_unit(memory_limit); if (!vm.count("num-scanner-workers")) { @@ -1024,7 +1034,8 @@ int mkdwarfs(int argc, char** argv) { << "': " << strerror(errno); return 1; } - } else if (auto oss = dynamic_cast(os.get())) { + /* oss is only read by the assert below, hence the [[maybe_unused]] attribute. */ } else if (auto oss [[maybe_unused]] = + dynamic_cast(os.get())) { assert(oss->str().empty()); } else { assert(false);