From 148de5bf0d0f8023edd593f736a908b2045ee1d7 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Thu, 27 Oct 2022 09:47:52 +0200 Subject: [PATCH] Add --file-hash option (fixes github #92) This does not yet address the issue that uniquely sized files are unnecessarily hashed, which is also mentioned in #92. This will be addressed separately. --- doc/mkdwarfs.md | 4 ++ include/dwarfs/checksum.h | 27 ++------ include/dwarfs/entry.h | 10 +-- include/dwarfs/fstypes.h | 5 -- include/dwarfs/options.h | 1 + src/dwarfs/checksum.cpp | 139 ++++++++++++++++++++------------------ src/dwarfs/entry.cpp | 41 +++++------ src/dwarfs/fs_section.cpp | 9 +-- src/dwarfs/scanner.cpp | 100 +++++++++++++++++---------- src/mkdwarfs.cpp | 27 +++++++- test/dwarfs.cpp | 48 ++++++++++--- 11 files changed, 246 insertions(+), 165 deletions(-) diff --git a/doc/mkdwarfs.md b/doc/mkdwarfs.md index 6ebaa246..63c16e39 100644 --- a/doc/mkdwarfs.md +++ b/doc/mkdwarfs.md @@ -276,6 +276,10 @@ Most other options are concerned with compression tuning: Don't add a creation timestamp. This is useful when bit-identical file system images are required to be produced from the same input. +- `--file-hash=none`|*name*: + Select the hashing function to be used for file deduplication. If `none` + is chosen, file deduplication is disabled. + - `--log-level=`*name*: Specifiy a logging level. diff --git a/include/dwarfs/checksum.h b/include/dwarfs/checksum.h index cc646c41..5b7234ac 100644 --- a/include/dwarfs/checksum.h +++ b/include/dwarfs/checksum.h @@ -38,27 +38,14 @@ class checksum { XXH3_128, }; - static constexpr size_t digest_size(algorithm alg) { - switch (alg) { - case algorithm::SHA1: - return 20; - case algorithm::SHA2_512_256: - return 32; - case algorithm::XXH3_64: - return 8; - case algorithm::XXH3_128: - return 16; - } - DWARFS_CHECK(false, "unknown algorithm"); - } + static bool is_available(std::string const& algo); + static std::vector available_algorithms(); - static bool - compute(algorithm alg, void const* data, size_t size, void* digest); - - static bool - verify(algorithm alg, void const* data, size_t size, void const* digest); + static bool verify(algorithm alg, void const* data, size_t size, + void const* digest, size_t digest_size); checksum(algorithm alg); + checksum(std::string const& alg); checksum& update(void const* data, size_t size) { impl_->update(data, size); @@ -69,7 +56,7 @@ class checksum { bool verify(void const* digest) const; - algorithm type() const { return alg_; } + size_t digest_size() const { return impl_->digest_size(); } class impl { public: @@ -77,11 +64,11 @@ class checksum { virtual void update(void const* data, size_t size) = 0; virtual bool finalize(void* digest) = 0; + virtual size_t digest_size() = 0; }; private: std::unique_ptr impl_; - algorithm const alg_; }; } // namespace dwarfs diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index 7e89d6bc..d12c507f 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -21,7 +21,6 @@ #pragma once -#include #include #include #include @@ -33,6 +32,8 @@ #include +#include + #include "dwarfs/entry_interface.h" namespace dwarfs { @@ -126,7 +127,8 @@ class file : public entry { std::shared_ptr get_inode() const; void accept(entry_visitor& v, bool preorder) override; void scan(os_access& os, progress& prog) override; - void scan(std::shared_ptr const& mm, progress& prog); + void scan(std::shared_ptr const& mm, progress& prog, + std::optional const& hash_alg); void create_data(); void hardlink(file* other, progress& prog); uint32_t unique_file_id() const; @@ -138,8 +140,8 @@ class file : public entry { private: struct data { - using hash_type = std::array; - hash_type hash{0}; + using hash_type = folly::small_vector; + hash_type hash; uint32_t refcount{1}; std::optional inode_num; }; diff --git a/include/dwarfs/fstypes.h b/include/dwarfs/fstypes.h index 7910efbe..512f2f0a 100644 --- a/include/dwarfs/fstypes.h +++ b/include/dwarfs/fstypes.h @@ -108,11 +108,6 @@ struct section_header_v2 { uint16_t compression; // [54] compression uint64_t length; // [56] length of section - static_assert(checksum::digest_size(checksum::algorithm::XXH3_64) == - sizeof(xxh3_64)); - static_assert(checksum::digest_size(checksum::algorithm::SHA2_512_256) == - sizeof(sha2_512_256)); - std::string to_string() const; void dump(std::ostream& os) const; }; diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index 03132b0b..9d8b83b7 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -87,6 +87,7 @@ struct file_order_options { struct scanner_options { file_order_options file_order; + std::optional file_hash_algorithm{"xxh3-128"}; std::optional uid; std::optional gid; std::optional timestamp; diff --git a/src/dwarfs/checksum.cpp b/src/dwarfs/checksum.cpp index 39e4deda..65ecd743 100644 --- a/src/dwarfs/checksum.cpp +++ b/src/dwarfs/checksum.cpp @@ -19,8 +19,11 @@ * along with dwarfs. If not, see . */ +#include #include #include +#include +#include #include @@ -35,46 +38,29 @@ namespace dwarfs { namespace { -bool compute_evp(const EVP_MD* algorithm, void const* data, size_t size, - void* digest, unsigned int* digest_size) { - return EVP_Digest(data, size, reinterpret_cast(digest), - digest_size, algorithm, nullptr); -} - -bool compute_xxh3_64(void const* data, size_t size, void* digest) { - auto hash = XXH3_64bits(data, size); - static_assert(checksum::digest_size(checksum::algorithm::XXH3_64) == - sizeof(hash)); - ::memcpy(digest, &hash, sizeof(hash)); - return true; -} - -bool compute_xxh3_128(void const* data, size_t size, void* digest) { - auto hash = XXH3_128bits(data, size); - static_assert(checksum::digest_size(checksum::algorithm::XXH3_128) == - sizeof(hash)); - ::memcpy(digest, &hash, sizeof(hash)); - return true; -} +std::unordered_set supported_algorithms{ + "xxh3-64", + "xxh3-128", +}; class checksum_evp : public checksum::impl { public: - checksum_evp(EVP_MD const* evp, checksum::algorithm alg) - : context_(EVP_MD_CTX_new()) - , dig_size_(checksum::digest_size(alg)) { - EVP_DigestInit_ex(context_, evp, nullptr); + checksum_evp(::EVP_MD const* evp) + : context_(::EVP_MD_CTX_new()) + , dig_size_(::EVP_MD_size(evp)) { + ::EVP_DigestInit_ex(context_, evp, nullptr); } - ~checksum_evp() override { EVP_MD_CTX_destroy(context_); } + ~checksum_evp() override { ::EVP_MD_CTX_destroy(context_); } void update(void const* data, size_t size) override { - DWARFS_CHECK(EVP_DigestUpdate(context_, data, size), + DWARFS_CHECK(::EVP_DigestUpdate(context_, data, size), "EVP_DigestUpdate() failed"); } bool finalize(void* digest) override { unsigned int dig_size = 0; - bool rv = EVP_DigestFinal_ex( + bool rv = ::EVP_DigestFinal_ex( context_, reinterpret_cast(digest), &dig_size); if (rv) { @@ -86,8 +72,27 @@ class checksum_evp : public checksum::impl { return rv; } + static std::vector available_algorithms() { + std::vector available; + ::EVP_MD_do_all( + [](const ::EVP_MD*, const char* from, const char* to, void* vec) { + if (!to) { + reinterpret_cast*>(vec)->emplace_back( + from); + } + }, + &available); + return available; + } + + static bool is_available(std::string const& algo) { + return ::EVP_get_digestbyname(algo.c_str()) != nullptr; + } + + size_t digest_size() override { return dig_size_; } + private: - EVP_MD_CTX* context_; + ::EVP_MD_CTX* context_; size_t const dig_size_; }; @@ -113,6 +118,10 @@ class checksum_xxh3_64 : public checksum::impl { return true; } + size_t digest_size() override { + return sizeof(decltype(std::function{XXH3_64bits_digest})::result_type); + } + private: XXH3_state_t* state_; }; @@ -139,57 +148,47 @@ class checksum_xxh3_128 : public checksum::impl { return true; } + size_t digest_size() override { + return sizeof(decltype(std::function{XXH3_128bits_digest})::result_type); + } + private: XXH3_state_t* state_; }; } // namespace -bool checksum::compute(algorithm alg, void const* data, size_t size, - void* digest) { - bool rv = false; - unsigned int dig_size = 0; +bool checksum::is_available(std::string const& algo) { + return supported_algorithms.count(algo) or checksum_evp::is_available(algo); +} - switch (alg) { - case algorithm::SHA1: - rv = compute_evp(EVP_sha1(), data, size, digest, &dig_size); - break; - case algorithm::SHA2_512_256: - rv = compute_evp(EVP_sha512_256(), data, size, digest, &dig_size); - break; - case algorithm::XXH3_64: - rv = compute_xxh3_64(data, size, digest); - break; - case algorithm::XXH3_128: - rv = compute_xxh3_128(data, size, digest); - break; - } - - if (rv && dig_size > 0) { - DWARFS_CHECK(digest_size(alg) == dig_size, - fmt::format("digest size mismatch: {0} != {1} [{2}]", - digest_size(alg), dig_size, - static_cast(alg))); - } - - return rv; +std::vector checksum::available_algorithms() { + auto available_evp = checksum_evp::available_algorithms(); + std::vector available; + available.insert(available.end(), supported_algorithms.begin(), + supported_algorithms.end()); + available.insert(available.end(), available_evp.begin(), available_evp.end()); + std::sort(available.begin(), available.end()); + return available; } bool checksum::verify(algorithm alg, void const* data, size_t size, - const void* digest) { + const void* digest, size_t digest_size) { std::array tmp; - return compute(alg, data, size, tmp.data()) && - ::memcmp(digest, tmp.data(), digest_size(alg)) == 0; + checksum cs(alg); + DWARFS_CHECK(digest_size == cs.digest_size(), "digest size mismatch"); + cs.update(data, size); + return cs.finalize(tmp.data()) && + ::memcmp(digest, tmp.data(), digest_size) == 0; } -checksum::checksum(algorithm alg) - : alg_(alg) { +checksum::checksum(algorithm alg) { switch (alg) { case algorithm::SHA1: - impl_ = std::make_unique(EVP_sha1(), alg); + impl_ = std::make_unique(::EVP_sha1()); break; case algorithm::SHA2_512_256: - impl_ = std::make_unique(EVP_sha512_256(), alg); + impl_ = std::make_unique(::EVP_sha512_256()); break; case algorithm::XXH3_64: impl_ = std::make_unique(); @@ -203,10 +202,22 @@ checksum::checksum(algorithm alg) } } +checksum::checksum(std::string const& alg) { + if (alg == "xxh3-64") { + impl_ = std::make_unique(); + } else if (alg == "xxh3-128") { + impl_ = std::make_unique(); + } else if (auto md = ::EVP_get_digestbyname(alg.c_str())) { + impl_ = std::make_unique(md); + } else { + DWARFS_CHECK(false, "unknown algorithm"); + } +} + bool checksum::verify(void const* digest) const { std::array tmp; return impl_->finalize(tmp.data()) && - ::memcmp(digest, tmp.data(), digest_size(alg_)) == 0; + ::memcmp(digest, tmp.data(), impl_->digest_size()) == 0; } } // namespace dwarfs diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index 884e0818..c87f0f75 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -142,7 +142,7 @@ void entry::set_ctime(uint64_t ctime) { stat_.st_atime = ctime; } std::string_view file::hash() const { auto& h = data_->hash; - return std::string_view(&h[0], h.size()); + return std::string_view(h.data(), h.size()); } void file::set_inode(std::shared_ptr ino) { @@ -164,32 +164,35 @@ void file::scan(os_access& os, progress& prog) { mm = os.map_file(path(), s); } - scan(mm, prog); + scan(mm, prog, "xxh3-128"); } -void file::scan(std::shared_ptr const& mm, progress& prog) { - constexpr auto alg = checksum::algorithm::XXH3_128; - static_assert(checksum::digest_size(alg) == sizeof(data::hash_type)); +void file::scan(std::shared_ptr const& mm, progress& prog, + std::optional const& hash_alg) { + if (hash_alg) { + checksum cs(*hash_alg); - if (size_t s = size(); s > 0) { - constexpr size_t chunk_size = 32 << 20; - prog.original_size += s; - checksum cs(alg); - size_t offset = 0; + if (size_t s = size(); s > 0) { + constexpr size_t chunk_size = 32 << 20; + prog.original_size += s; + size_t offset = 0; - while (s >= chunk_size) { - cs.update(mm->as(offset), chunk_size); - mm->release_until(offset); - offset += chunk_size; - s -= chunk_size; + while (s >= chunk_size) { + cs.update(mm->as(offset), chunk_size); + mm->release_until(offset); + offset += chunk_size; + s -= chunk_size; + } + + cs.update(mm->as(offset), s); } - cs.update(mm->as(offset), s); + data_->hash.resize(cs.digest_size()); - DWARFS_CHECK(cs.finalize(&data_->hash[0]), "checksum computation failed"); - } else { - DWARFS_CHECK(checksum::compute(alg, nullptr, 0, &data_->hash[0]), + DWARFS_CHECK(cs.finalize(data_->hash.data()), "checksum computation failed"); + } else { + prog.original_size += size(); } } diff --git a/src/dwarfs/fs_section.cpp b/src/dwarfs/fs_section.cpp index 13258ca5..3e6b5fca 100644 --- a/src/dwarfs/fs_section.cpp +++ b/src/dwarfs/fs_section.cpp @@ -122,9 +122,9 @@ class fs_section_v2 : public fs_section::impl { bool check_fast(mmif& mm) const override { auto hdr_cs_len = sizeof(section_header_v2) - offsetof(section_header_v2, number); - return checksum::verify(checksum::algorithm::XXH3_64, - mm.as(start_ - hdr_cs_len), - hdr_.length + hdr_cs_len, &hdr_.xxh3_64); + return checksum::verify( + checksum::algorithm::XXH3_64, mm.as(start_ - hdr_cs_len), + hdr_.length + hdr_cs_len, &hdr_.xxh3_64, sizeof(hdr_.xxh3_64)); } bool verify(mmif& mm) const override { @@ -132,7 +132,8 @@ class fs_section_v2 : public fs_section::impl { sizeof(section_header_v2) - offsetof(section_header_v2, xxh3_64); return checksum::verify(checksum::algorithm::SHA2_512_256, mm.as(start_ - hdr_sha_len), - hdr_.length + hdr_sha_len, &hdr_.sha2_512_256); + hdr_.length + hdr_sha_len, &hdr_.sha2_512_256, + sizeof(hdr_.sha2_512_256)); } folly::ByteRange data(mmif& mm) const override { diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index fa6a8de4..75595b9b 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -77,22 +77,23 @@ class visitor_base : public entry_visitor { class file_scanner { public: file_scanner(worker_group& wg, os_access& os, inode_manager& im, - inode_options const& ino_opts, progress& prog) + inode_options const& ino_opts, + std::optional const& hash_algo, progress& prog) : wg_(wg) , os_(os) , im_(im) , ino_opts_(ino_opts) + , hash_algo_{hash_algo} , prog_(prog) {} void scan(file* p) { if (p->num_hard_links() > 1) { - auto ino = p->raw_inode_num(); - auto [it, is_new] = hardlink_cache_.emplace(ino, p); + auto& vec = hardlinks_[p->raw_inode_num()]; + vec.push_back(p); - if (!is_new) { - p->hardlink(it->second, prog_); + if (vec.size() > 1) { + p->hardlink(vec[0], prog_); ++prog_.files_scanned; - hardlinked_.push_back(p); return; } } @@ -108,20 +109,26 @@ class file_scanner { } prog_.current.store(p); - p->scan(mm, prog_); + p->scan(mm, prog_, hash_algo_); ++prog_.files_scanned; std::shared_ptr inode; { std::lock_guard lock(mx_); - auto& ref = hash_[p->hash()]; - if (ref.empty()) { + if (hash_algo_) { + auto& ref = hash_[p->hash()]; + if (ref.empty()) { + inode = im_.create_inode(); + p->set_inode(inode); + } else { + p->set_inode(ref.front()->get_inode()); + } + ref.push_back(p); + } else { + files_[p->raw_inode_num()].push_back(p); inode = im_.create_inode(); p->set_inode(inode); - } else { - p->set_inode(ref.front()->get_inode()); } - ref.push_back(p); } if (inode) { @@ -139,26 +146,50 @@ class file_scanner { } void finalize(uint32_t& inode_num) { - hardlink_cache_.clear(); + if (hash_algo_) { + finalize_hardlinks(hash_, [](file const* p) { return p->hash(); }); + finalize_files(hash_, inode_num); + } else { + finalize_hardlinks(files_, + [](file const* p) { return p->raw_inode_num(); }); + finalize_files(files_, inode_num); + } + } - for (auto p : hardlinked_) { - auto& fv = hash_[p->hash()]; - p->set_inode(fv.front()->get_inode()); - fv.push_back(p); + uint32_t num_unique() const { return num_unique_; } + + private: + template + void finalize_hardlinks(folly::F14FastMap& fmap, + Lookup&& lookup) { + for (auto& kv : hardlinks_) { + auto& hlv = kv.second; + if (hlv.size() > 1) { + auto& fv = fmap[lookup(hlv.front())]; + // TODO: for (auto p : hlv | std::views::drop(1)) { + std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) { + p->set_inode(fv.front()->get_inode()); + fv.push_back(p); + }); + } } - hardlinked_.clear(); + hardlinks_.clear(); + } - std::vector> ent; - ent.reserve(hash_.size()); - hash_.eraseInto(hash_.begin(), hash_.end(), - [&ent](std::string_view&& h, inode::files_vector&& fv) { - ent.emplace_back(std::move(h), std::move(fv)); - }); + template + void finalize_files(folly::F14FastMap& fmap, + uint32_t& inode_num) { + std::vector> ent; + ent.reserve(fmap.size()); + fmap.eraseInto(fmap.begin(), fmap.end(), + [&ent](KeyType&& k, inode::files_vector&& fv) { + ent.emplace_back(std::move(k), std::move(fv)); + }); std::sort(ent.begin(), ent.end(), [](auto& left, auto& right) { return left.first < right.first; }); - DWARFS_CHECK(hash_.empty(), "expected hash to be empty"); + DWARFS_CHECK(fmap.empty(), "expected file map to be empty"); uint32_t obj_num = 0; @@ -166,13 +197,10 @@ class file_scanner { finalize_inodes(ent, inode_num, obj_num); } - uint32_t num_unique() const { return num_unique_; } - - private: - template - void finalize_inodes( - std::vector>& ent, - uint32_t& inode_num, uint32_t& obj_num) { + template + void + finalize_inodes(std::vector>& ent, + uint32_t& inode_num, uint32_t& obj_num) { for (auto& p : ent) { auto& files = p.second; @@ -217,12 +245,13 @@ class file_scanner { os_access& os_; inode_manager& im_; inode_options const& ino_opts_; + std::optional const hash_algo_; progress& prog_; uint32_t num_unique_{0}; - std::vector hardlinked_; - folly::F14FastMap hardlink_cache_; + folly::F14FastMap hardlinks_; std::mutex mx_; folly::F14FastMap hash_; + folly::F14FastMap files_; }; class dir_set_inode_visitor : public visitor_base { @@ -600,7 +629,8 @@ void scanner_::scan(filesystem_writer& fsw, prog.set_status_function(status_string); inode_manager im(lgr_, prog); - file_scanner fs(wg_, *os_, im, options_.inode, prog); + file_scanner fs(wg_, *os_, im, options_.inode, options_.file_hash_algorithm, + prog); auto root = scan_tree(path, prog, fs); diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index 7c8d8ddf..d9ba0893 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -100,13 +100,15 @@ const std::map order_choices{ {"script", file_order_mode::SCRIPT}, #endif {"similarity", file_order_mode::SIMILARITY}, - {"nilsimsa", file_order_mode::NILSIMSA}}; + {"nilsimsa", file_order_mode::NILSIMSA}, +}; const std::map progress_modes{ {"none", console_writer::NONE}, {"simple", console_writer::SIMPLE}, {"ascii", console_writer::ASCII}, - {"unicode", console_writer::UNICODE}}; + {"unicode", console_writer::UNICODE}, +}; const std::map time_resolutions{ {"sec", 1}, @@ -336,7 +338,8 @@ int mkdwarfs(int argc, char** argv) { block_manager::config cfg; std::string path, output, memory_limit, script_arg, compression, header, schema_compression, metadata_compression, log_level_str, timestamp, - time_resolution, order, progress_mode, recompress_opts, pack_metadata; + time_resolution, order, progress_mode, recompress_opts, pack_metadata, + file_hash_algo; size_t num_workers; bool no_progress = false, remove_header = false, no_section_index = false, force_overwrite = false; @@ -355,6 +358,11 @@ int mkdwarfs(int argc, char** argv) { (from(time_resolutions) | get<0>() | unsplit(", ")) + ")"; + auto hash_list = checksum::available_algorithms(); + + auto file_hash_desc = "choice of file hashing function (none, " + + (from(hash_list) | unsplit(", ")) + ")"; + // clang-format off po::options_description opts("Command line options"); opts.add_options() @@ -453,6 +461,9 @@ int mkdwarfs(int argc, char** argv) { ("no-create-timestamp", po::value(&options.no_create_timestamp)->zero_tokens(), "don't add create timestamp to file system") + ("file-hash", + po::value(&file_hash_algo)->default_value("xxh3-128"), + file_hash_desc.c_str()) ("log-level", po::value(&log_level_str)->default_value("info"), "log level (error, warn, info, debug, trace)") @@ -657,6 +668,16 @@ int mkdwarfs(int argc, char** argv) { return 1; } + if (file_hash_algo == "none") { + options.file_hash_algorithm.reset(); + } else if (checksum::is_available(file_hash_algo)) { + options.file_hash_algorithm = file_hash_algo; + } else { + std::cerr << "error: unknown file hash function '" << file_hash_algo + << "'\n"; + return 1; + } + size_t mem_limit = parse_size_with_unit(memory_limit); worker_group wg_compress("compress", num_workers); diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index cbe30242..f40d49ea 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -50,6 +50,8 @@ using namespace dwarfs; namespace { +std::string const default_file_hash_algo{"xxh3-128"}; + std::string build_dwarfs(logger& lgr, std::shared_ptr input, std::string const& compression, @@ -80,7 +82,8 @@ void basic_end_to_end_test(std::string const& compressor, bool pack_directories, bool pack_shared_files_table, bool pack_names, bool pack_names_index, bool pack_symlinks, bool pack_symlinks_index, - bool plain_names_table, bool plain_symlinks_table) { + bool plain_names_table, bool plain_symlinks_table, + std::optional file_hash_algo) { block_manager::config cfg; scanner_options options; @@ -88,6 +91,7 @@ void basic_end_to_end_test(std::string const& compressor, cfg.block_size_bits = block_size_bits; options.file_order.mode = file_order; + options.file_hash_algorithm = file_hash_algo; options.with_devices = with_devices; options.with_specials = with_specials; options.inode.with_similarity = file_order == file_order_mode::SIMILARITY; @@ -397,8 +401,10 @@ class compression_test std::tuple> {}; class scanner_test : public testing::TestWithParam< - std::tuple> { -}; + std::tuple>> {}; + +class hashing_test : public testing::TestWithParam {}; class packing_test : public testing::TestWithParam< std::tuple> { @@ -417,17 +423,24 @@ TEST_P(compression_test, end_to_end) { basic_end_to_end_test(compressor, block_size_bits, file_order, true, true, false, false, false, false, false, true, true, true, - true, true, true, true, false, false); + true, true, true, true, false, false, + default_file_hash_algo); } TEST_P(scanner_test, end_to_end) { auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times, - enable_nlink] = GetParam(); + enable_nlink, file_hash_algo] = GetParam(); basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times, enable_nlink, true, true, true, true, - true, true, true, false, false); + true, true, true, false, false, file_hash_algo); +} + +TEST_P(hashing_test, end_to_end) { + basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true, + true, true, true, true, true, true, true, true, true, + true, true, true, false, false, GetParam()); } TEST_P(packing_test, end_to_end) { @@ -438,7 +451,7 @@ TEST_P(packing_test, end_to_end) { false, false, false, false, false, pack_chunk_table, pack_directories, pack_shared_files_table, pack_names, pack_names_index, pack_symlinks, pack_symlinks_index, - false, false); + false, false, default_file_hash_algo); } TEST_P(plain_tables_test, end_to_end) { @@ -447,7 +460,7 @@ TEST_P(plain_tables_test, end_to_end) { basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true, false, false, false, false, false, false, false, false, false, false, false, false, plain_names_table, - plain_symlinks_table); + plain_symlinks_table, default_file_hash_algo); } TEST_P(packing_test, regression_empty_fs) { @@ -516,7 +529,11 @@ INSTANTIATE_TEST_SUITE_P( dwarfs, scanner_test, ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), - ::testing::Bool())); + ::testing::Bool(), + ::testing::Values(std::nullopt, "xxh3-128", "sha512"))); + +INSTANTIATE_TEST_SUITE_P(dwarfs, hashing_test, + ::testing::ValuesIn(checksum::available_algorithms())); INSTANTIATE_TEST_SUITE_P( dwarfs, packing_test, @@ -661,7 +678,12 @@ TEST_P(compression_regression, github45) { INSTANTIATE_TEST_SUITE_P(dwarfs, compression_regression, ::testing::ValuesIn(compressions)); -TEST(scanner, inode_ordering) { +class file_scanner : public testing::TestWithParam> { +}; + +TEST_P(file_scanner, inode_ordering) { + auto file_hash_algo = GetParam(); + std::ostringstream logss; stream_logger lgr(logss); // TODO: mock lgr.set_policy(); @@ -670,9 +692,10 @@ TEST(scanner, inode_ordering) { auto opts = scanner_options(); opts.file_order.mode = file_order_mode::PATH; + opts.file_hash_algorithm = file_hash_algo; auto input = std::make_shared(); - constexpr int dim = 15; + constexpr int dim = 14; input->add_dir(""); @@ -693,3 +716,6 @@ TEST(scanner, inode_ordering) { EXPECT_EQ(ref, build_dwarfs(lgr, input, "null", bmcfg, opts)); } } + +INSTANTIATE_TEST_SUITE_P(dwarfs, file_scanner, + ::testing::Values(std::nullopt, "xxh3-128"));