mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-17 00:10:03 -04:00
Add --file-hash option (fixes github #92)
This does not yet address the issue that uniquely sized files are unnecessarily hashed, which is also mentioned in #92. This will be addressed separately.
This commit is contained in:
parent
482a40560e
commit
148de5bf0d
@ -276,6 +276,10 @@ Most other options are concerned with compression tuning:
|
||||
Don't add a creation timestamp. This is useful when bit-identical file
|
||||
system images are required to be produced from the same input.
|
||||
|
||||
- `--file-hash=none`|*name*:
|
||||
Select the hashing function to be used for file deduplication. If `none`
|
||||
is chosen, file deduplication is disabled.
|
||||
|
||||
- `--log-level=`*name*:
|
||||
Specify a logging level.
|
||||
|
||||
|
@ -38,27 +38,14 @@ class checksum {
|
||||
XXH3_128,
|
||||
};
|
||||
|
||||
static constexpr size_t digest_size(algorithm alg) {
|
||||
switch (alg) {
|
||||
case algorithm::SHA1:
|
||||
return 20;
|
||||
case algorithm::SHA2_512_256:
|
||||
return 32;
|
||||
case algorithm::XXH3_64:
|
||||
return 8;
|
||||
case algorithm::XXH3_128:
|
||||
return 16;
|
||||
}
|
||||
DWARFS_CHECK(false, "unknown algorithm");
|
||||
}
|
||||
static bool is_available(std::string const& algo);
|
||||
static std::vector<std::string> available_algorithms();
|
||||
|
||||
static bool
|
||||
compute(algorithm alg, void const* data, size_t size, void* digest);
|
||||
|
||||
static bool
|
||||
verify(algorithm alg, void const* data, size_t size, void const* digest);
|
||||
static bool verify(algorithm alg, void const* data, size_t size,
|
||||
void const* digest, size_t digest_size);
|
||||
|
||||
checksum(algorithm alg);
|
||||
checksum(std::string const& alg);
|
||||
|
||||
checksum& update(void const* data, size_t size) {
|
||||
impl_->update(data, size);
|
||||
@ -69,7 +56,7 @@ class checksum {
|
||||
|
||||
bool verify(void const* digest) const;
|
||||
|
||||
algorithm type() const { return alg_; }
|
||||
size_t digest_size() const { return impl_->digest_size(); }
|
||||
|
||||
class impl {
|
||||
public:
|
||||
@ -77,11 +64,11 @@ class checksum {
|
||||
|
||||
virtual void update(void const* data, size_t size) = 0;
|
||||
virtual bool finalize(void* digest) = 0;
|
||||
virtual size_t digest_size() = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
std::unique_ptr<impl> impl_;
|
||||
algorithm const alg_;
|
||||
};
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -21,7 +21,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
@ -33,6 +32,8 @@
|
||||
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
|
||||
#include "dwarfs/entry_interface.h"
|
||||
|
||||
namespace dwarfs {
|
||||
@ -126,7 +127,8 @@ class file : public entry {
|
||||
std::shared_ptr<inode> get_inode() const;
|
||||
void accept(entry_visitor& v, bool preorder) override;
|
||||
void scan(os_access& os, progress& prog) override;
|
||||
void scan(std::shared_ptr<mmif> const& mm, progress& prog);
|
||||
void scan(std::shared_ptr<mmif> const& mm, progress& prog,
|
||||
std::optional<std::string> const& hash_alg);
|
||||
void create_data();
|
||||
void hardlink(file* other, progress& prog);
|
||||
uint32_t unique_file_id() const;
|
||||
@ -138,8 +140,8 @@ class file : public entry {
|
||||
|
||||
private:
|
||||
struct data {
|
||||
using hash_type = std::array<char, 16>;
|
||||
hash_type hash{0};
|
||||
using hash_type = folly::small_vector<char, 16>;
|
||||
hash_type hash;
|
||||
uint32_t refcount{1};
|
||||
std::optional<uint32_t> inode_num;
|
||||
};
|
||||
|
@ -108,11 +108,6 @@ struct section_header_v2 {
|
||||
uint16_t compression; // [54] compression
|
||||
uint64_t length; // [56] length of section
|
||||
|
||||
static_assert(checksum::digest_size(checksum::algorithm::XXH3_64) ==
|
||||
sizeof(xxh3_64));
|
||||
static_assert(checksum::digest_size(checksum::algorithm::SHA2_512_256) ==
|
||||
sizeof(sha2_512_256));
|
||||
|
||||
std::string to_string() const;
|
||||
void dump(std::ostream& os) const;
|
||||
};
|
||||
|
@ -87,6 +87,7 @@ struct file_order_options {
|
||||
|
||||
struct scanner_options {
|
||||
file_order_options file_order;
|
||||
std::optional<std::string> file_hash_algorithm{"xxh3-128"};
|
||||
std::optional<uint16_t> uid;
|
||||
std::optional<uint16_t> gid;
|
||||
std::optional<uint64_t> timestamp;
|
||||
|
@ -19,8 +19,11 @@
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <functional>
|
||||
#include <unordered_set>
|
||||
|
||||
#include <openssl/evp.h>
|
||||
|
||||
@ -35,46 +38,29 @@ namespace dwarfs {
|
||||
|
||||
namespace {
|
||||
|
||||
bool compute_evp(const EVP_MD* algorithm, void const* data, size_t size,
|
||||
void* digest, unsigned int* digest_size) {
|
||||
return EVP_Digest(data, size, reinterpret_cast<unsigned char*>(digest),
|
||||
digest_size, algorithm, nullptr);
|
||||
}
|
||||
|
||||
bool compute_xxh3_64(void const* data, size_t size, void* digest) {
|
||||
auto hash = XXH3_64bits(data, size);
|
||||
static_assert(checksum::digest_size(checksum::algorithm::XXH3_64) ==
|
||||
sizeof(hash));
|
||||
::memcpy(digest, &hash, sizeof(hash));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_xxh3_128(void const* data, size_t size, void* digest) {
|
||||
auto hash = XXH3_128bits(data, size);
|
||||
static_assert(checksum::digest_size(checksum::algorithm::XXH3_128) ==
|
||||
sizeof(hash));
|
||||
::memcpy(digest, &hash, sizeof(hash));
|
||||
return true;
|
||||
}
|
||||
std::unordered_set<std::string> supported_algorithms{
|
||||
"xxh3-64",
|
||||
"xxh3-128",
|
||||
};
|
||||
|
||||
class checksum_evp : public checksum::impl {
|
||||
public:
|
||||
checksum_evp(EVP_MD const* evp, checksum::algorithm alg)
|
||||
: context_(EVP_MD_CTX_new())
|
||||
, dig_size_(checksum::digest_size(alg)) {
|
||||
EVP_DigestInit_ex(context_, evp, nullptr);
|
||||
checksum_evp(::EVP_MD const* evp)
|
||||
: context_(::EVP_MD_CTX_new())
|
||||
, dig_size_(::EVP_MD_size(evp)) {
|
||||
::EVP_DigestInit_ex(context_, evp, nullptr);
|
||||
}
|
||||
|
||||
~checksum_evp() override { EVP_MD_CTX_destroy(context_); }
|
||||
~checksum_evp() override { ::EVP_MD_CTX_destroy(context_); }
|
||||
|
||||
void update(void const* data, size_t size) override {
|
||||
DWARFS_CHECK(EVP_DigestUpdate(context_, data, size),
|
||||
DWARFS_CHECK(::EVP_DigestUpdate(context_, data, size),
|
||||
"EVP_DigestUpdate() failed");
|
||||
}
|
||||
|
||||
bool finalize(void* digest) override {
|
||||
unsigned int dig_size = 0;
|
||||
bool rv = EVP_DigestFinal_ex(
|
||||
bool rv = ::EVP_DigestFinal_ex(
|
||||
context_, reinterpret_cast<unsigned char*>(digest), &dig_size);
|
||||
|
||||
if (rv) {
|
||||
@ -86,8 +72,27 @@ class checksum_evp : public checksum::impl {
|
||||
return rv;
|
||||
}
|
||||
|
||||
static std::vector<std::string> available_algorithms() {
|
||||
std::vector<std::string> available;
|
||||
::EVP_MD_do_all(
|
||||
[](const ::EVP_MD*, const char* from, const char* to, void* vec) {
|
||||
if (!to) {
|
||||
reinterpret_cast<std::vector<std::string>*>(vec)->emplace_back(
|
||||
from);
|
||||
}
|
||||
},
|
||||
&available);
|
||||
return available;
|
||||
}
|
||||
|
||||
static bool is_available(std::string const& algo) {
|
||||
return ::EVP_get_digestbyname(algo.c_str()) != nullptr;
|
||||
}
|
||||
|
||||
size_t digest_size() override { return dig_size_; }
|
||||
|
||||
private:
|
||||
EVP_MD_CTX* context_;
|
||||
::EVP_MD_CTX* context_;
|
||||
size_t const dig_size_;
|
||||
};
|
||||
|
||||
@ -113,6 +118,10 @@ class checksum_xxh3_64 : public checksum::impl {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t digest_size() override {
|
||||
return sizeof(decltype(std::function{XXH3_64bits_digest})::result_type);
|
||||
}
|
||||
|
||||
private:
|
||||
XXH3_state_t* state_;
|
||||
};
|
||||
@ -139,57 +148,47 @@ class checksum_xxh3_128 : public checksum::impl {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t digest_size() override {
|
||||
return sizeof(decltype(std::function{XXH3_128bits_digest})::result_type);
|
||||
}
|
||||
|
||||
private:
|
||||
XXH3_state_t* state_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
bool checksum::compute(algorithm alg, void const* data, size_t size,
|
||||
void* digest) {
|
||||
bool rv = false;
|
||||
unsigned int dig_size = 0;
|
||||
bool checksum::is_available(std::string const& algo) {
|
||||
return supported_algorithms.count(algo) or checksum_evp::is_available(algo);
|
||||
}
|
||||
|
||||
switch (alg) {
|
||||
case algorithm::SHA1:
|
||||
rv = compute_evp(EVP_sha1(), data, size, digest, &dig_size);
|
||||
break;
|
||||
case algorithm::SHA2_512_256:
|
||||
rv = compute_evp(EVP_sha512_256(), data, size, digest, &dig_size);
|
||||
break;
|
||||
case algorithm::XXH3_64:
|
||||
rv = compute_xxh3_64(data, size, digest);
|
||||
break;
|
||||
case algorithm::XXH3_128:
|
||||
rv = compute_xxh3_128(data, size, digest);
|
||||
break;
|
||||
}
|
||||
|
||||
if (rv && dig_size > 0) {
|
||||
DWARFS_CHECK(digest_size(alg) == dig_size,
|
||||
fmt::format("digest size mismatch: {0} != {1} [{2}]",
|
||||
digest_size(alg), dig_size,
|
||||
static_cast<int>(alg)));
|
||||
}
|
||||
|
||||
return rv;
|
||||
std::vector<std::string> checksum::available_algorithms() {
|
||||
auto available_evp = checksum_evp::available_algorithms();
|
||||
std::vector<std::string> available;
|
||||
available.insert(available.end(), supported_algorithms.begin(),
|
||||
supported_algorithms.end());
|
||||
available.insert(available.end(), available_evp.begin(), available_evp.end());
|
||||
std::sort(available.begin(), available.end());
|
||||
return available;
|
||||
}
|
||||
|
||||
bool checksum::verify(algorithm alg, void const* data, size_t size,
|
||||
const void* digest) {
|
||||
const void* digest, size_t digest_size) {
|
||||
std::array<char, EVP_MAX_MD_SIZE> tmp;
|
||||
return compute(alg, data, size, tmp.data()) &&
|
||||
::memcmp(digest, tmp.data(), digest_size(alg)) == 0;
|
||||
checksum cs(alg);
|
||||
DWARFS_CHECK(digest_size == cs.digest_size(), "digest size mismatch");
|
||||
cs.update(data, size);
|
||||
return cs.finalize(tmp.data()) &&
|
||||
::memcmp(digest, tmp.data(), digest_size) == 0;
|
||||
}
|
||||
|
||||
checksum::checksum(algorithm alg)
|
||||
: alg_(alg) {
|
||||
checksum::checksum(algorithm alg) {
|
||||
switch (alg) {
|
||||
case algorithm::SHA1:
|
||||
impl_ = std::make_unique<checksum_evp>(EVP_sha1(), alg);
|
||||
impl_ = std::make_unique<checksum_evp>(::EVP_sha1());
|
||||
break;
|
||||
case algorithm::SHA2_512_256:
|
||||
impl_ = std::make_unique<checksum_evp>(EVP_sha512_256(), alg);
|
||||
impl_ = std::make_unique<checksum_evp>(::EVP_sha512_256());
|
||||
break;
|
||||
case algorithm::XXH3_64:
|
||||
impl_ = std::make_unique<checksum_xxh3_64>();
|
||||
@ -203,10 +202,22 @@ checksum::checksum(algorithm alg)
|
||||
}
|
||||
}
|
||||
|
||||
checksum::checksum(std::string const& alg) {
|
||||
if (alg == "xxh3-64") {
|
||||
impl_ = std::make_unique<checksum_xxh3_64>();
|
||||
} else if (alg == "xxh3-128") {
|
||||
impl_ = std::make_unique<checksum_xxh3_128>();
|
||||
} else if (auto md = ::EVP_get_digestbyname(alg.c_str())) {
|
||||
impl_ = std::make_unique<checksum_evp>(md);
|
||||
} else {
|
||||
DWARFS_CHECK(false, "unknown algorithm");
|
||||
}
|
||||
}
|
||||
|
||||
bool checksum::verify(void const* digest) const {
|
||||
std::array<char, EVP_MAX_MD_SIZE> tmp;
|
||||
return impl_->finalize(tmp.data()) &&
|
||||
::memcmp(digest, tmp.data(), digest_size(alg_)) == 0;
|
||||
::memcmp(digest, tmp.data(), impl_->digest_size()) == 0;
|
||||
}
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -142,7 +142,7 @@ void entry::set_ctime(uint64_t ctime) { stat_.st_atime = ctime; }
|
||||
|
||||
std::string_view file::hash() const {
|
||||
auto& h = data_->hash;
|
||||
return std::string_view(&h[0], h.size());
|
||||
return std::string_view(h.data(), h.size());
|
||||
}
|
||||
|
||||
void file::set_inode(std::shared_ptr<inode> ino) {
|
||||
@ -164,32 +164,35 @@ void file::scan(os_access& os, progress& prog) {
|
||||
mm = os.map_file(path(), s);
|
||||
}
|
||||
|
||||
scan(mm, prog);
|
||||
scan(mm, prog, "xxh3-128");
|
||||
}
|
||||
|
||||
void file::scan(std::shared_ptr<mmif> const& mm, progress& prog) {
|
||||
constexpr auto alg = checksum::algorithm::XXH3_128;
|
||||
static_assert(checksum::digest_size(alg) == sizeof(data::hash_type));
|
||||
void file::scan(std::shared_ptr<mmif> const& mm, progress& prog,
|
||||
std::optional<std::string> const& hash_alg) {
|
||||
if (hash_alg) {
|
||||
checksum cs(*hash_alg);
|
||||
|
||||
if (size_t s = size(); s > 0) {
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
prog.original_size += s;
|
||||
checksum cs(alg);
|
||||
size_t offset = 0;
|
||||
if (size_t s = size(); s > 0) {
|
||||
constexpr size_t chunk_size = 32 << 20;
|
||||
prog.original_size += s;
|
||||
size_t offset = 0;
|
||||
|
||||
while (s >= chunk_size) {
|
||||
cs.update(mm->as<void>(offset), chunk_size);
|
||||
mm->release_until(offset);
|
||||
offset += chunk_size;
|
||||
s -= chunk_size;
|
||||
while (s >= chunk_size) {
|
||||
cs.update(mm->as<void>(offset), chunk_size);
|
||||
mm->release_until(offset);
|
||||
offset += chunk_size;
|
||||
s -= chunk_size;
|
||||
}
|
||||
|
||||
cs.update(mm->as<void>(offset), s);
|
||||
}
|
||||
|
||||
cs.update(mm->as<void>(offset), s);
|
||||
data_->hash.resize(cs.digest_size());
|
||||
|
||||
DWARFS_CHECK(cs.finalize(&data_->hash[0]), "checksum computation failed");
|
||||
} else {
|
||||
DWARFS_CHECK(checksum::compute(alg, nullptr, 0, &data_->hash[0]),
|
||||
DWARFS_CHECK(cs.finalize(data_->hash.data()),
|
||||
"checksum computation failed");
|
||||
} else {
|
||||
prog.original_size += size();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,9 +122,9 @@ class fs_section_v2 : public fs_section::impl {
|
||||
bool check_fast(mmif& mm) const override {
|
||||
auto hdr_cs_len =
|
||||
sizeof(section_header_v2) - offsetof(section_header_v2, number);
|
||||
return checksum::verify(checksum::algorithm::XXH3_64,
|
||||
mm.as<void>(start_ - hdr_cs_len),
|
||||
hdr_.length + hdr_cs_len, &hdr_.xxh3_64);
|
||||
return checksum::verify(
|
||||
checksum::algorithm::XXH3_64, mm.as<void>(start_ - hdr_cs_len),
|
||||
hdr_.length + hdr_cs_len, &hdr_.xxh3_64, sizeof(hdr_.xxh3_64));
|
||||
}
|
||||
|
||||
bool verify(mmif& mm) const override {
|
||||
@ -132,7 +132,8 @@ class fs_section_v2 : public fs_section::impl {
|
||||
sizeof(section_header_v2) - offsetof(section_header_v2, xxh3_64);
|
||||
return checksum::verify(checksum::algorithm::SHA2_512_256,
|
||||
mm.as<void>(start_ - hdr_sha_len),
|
||||
hdr_.length + hdr_sha_len, &hdr_.sha2_512_256);
|
||||
hdr_.length + hdr_sha_len, &hdr_.sha2_512_256,
|
||||
sizeof(hdr_.sha2_512_256));
|
||||
}
|
||||
|
||||
folly::ByteRange data(mmif& mm) const override {
|
||||
|
@ -77,22 +77,23 @@ class visitor_base : public entry_visitor {
|
||||
class file_scanner {
|
||||
public:
|
||||
file_scanner(worker_group& wg, os_access& os, inode_manager& im,
|
||||
inode_options const& ino_opts, progress& prog)
|
||||
inode_options const& ino_opts,
|
||||
std::optional<std::string> const& hash_algo, progress& prog)
|
||||
: wg_(wg)
|
||||
, os_(os)
|
||||
, im_(im)
|
||||
, ino_opts_(ino_opts)
|
||||
, hash_algo_{hash_algo}
|
||||
, prog_(prog) {}
|
||||
|
||||
void scan(file* p) {
|
||||
if (p->num_hard_links() > 1) {
|
||||
auto ino = p->raw_inode_num();
|
||||
auto [it, is_new] = hardlink_cache_.emplace(ino, p);
|
||||
auto& vec = hardlinks_[p->raw_inode_num()];
|
||||
vec.push_back(p);
|
||||
|
||||
if (!is_new) {
|
||||
p->hardlink(it->second, prog_);
|
||||
if (vec.size() > 1) {
|
||||
p->hardlink(vec[0], prog_);
|
||||
++prog_.files_scanned;
|
||||
hardlinked_.push_back(p);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -108,20 +109,26 @@ class file_scanner {
|
||||
}
|
||||
|
||||
prog_.current.store(p);
|
||||
p->scan(mm, prog_);
|
||||
p->scan(mm, prog_, hash_algo_);
|
||||
++prog_.files_scanned;
|
||||
std::shared_ptr<inode> inode;
|
||||
|
||||
{
|
||||
std::lock_guard lock(mx_);
|
||||
auto& ref = hash_[p->hash()];
|
||||
if (ref.empty()) {
|
||||
if (hash_algo_) {
|
||||
auto& ref = hash_[p->hash()];
|
||||
if (ref.empty()) {
|
||||
inode = im_.create_inode();
|
||||
p->set_inode(inode);
|
||||
} else {
|
||||
p->set_inode(ref.front()->get_inode());
|
||||
}
|
||||
ref.push_back(p);
|
||||
} else {
|
||||
files_[p->raw_inode_num()].push_back(p);
|
||||
inode = im_.create_inode();
|
||||
p->set_inode(inode);
|
||||
} else {
|
||||
p->set_inode(ref.front()->get_inode());
|
||||
}
|
||||
ref.push_back(p);
|
||||
}
|
||||
|
||||
if (inode) {
|
||||
@ -139,26 +146,50 @@ class file_scanner {
|
||||
}
|
||||
|
||||
void finalize(uint32_t& inode_num) {
|
||||
hardlink_cache_.clear();
|
||||
if (hash_algo_) {
|
||||
finalize_hardlinks(hash_, [](file const* p) { return p->hash(); });
|
||||
finalize_files(hash_, inode_num);
|
||||
} else {
|
||||
finalize_hardlinks(files_,
|
||||
[](file const* p) { return p->raw_inode_num(); });
|
||||
finalize_files(files_, inode_num);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto p : hardlinked_) {
|
||||
auto& fv = hash_[p->hash()];
|
||||
p->set_inode(fv.front()->get_inode());
|
||||
fv.push_back(p);
|
||||
uint32_t num_unique() const { return num_unique_; }
|
||||
|
||||
private:
|
||||
template <typename KeyType, typename Lookup>
|
||||
void finalize_hardlinks(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
|
||||
Lookup&& lookup) {
|
||||
for (auto& kv : hardlinks_) {
|
||||
auto& hlv = kv.second;
|
||||
if (hlv.size() > 1) {
|
||||
auto& fv = fmap[lookup(hlv.front())];
|
||||
// TODO: for (auto p : hlv | std::views::drop(1)) {
|
||||
std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
|
||||
p->set_inode(fv.front()->get_inode());
|
||||
fv.push_back(p);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
hardlinked_.clear();
|
||||
hardlinks_.clear();
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string_view, inode::files_vector>> ent;
|
||||
ent.reserve(hash_.size());
|
||||
hash_.eraseInto(hash_.begin(), hash_.end(),
|
||||
[&ent](std::string_view&& h, inode::files_vector&& fv) {
|
||||
ent.emplace_back(std::move(h), std::move(fv));
|
||||
});
|
||||
template <typename KeyType>
|
||||
void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
|
||||
uint32_t& inode_num) {
|
||||
std::vector<std::pair<KeyType, inode::files_vector>> ent;
|
||||
ent.reserve(fmap.size());
|
||||
fmap.eraseInto(fmap.begin(), fmap.end(),
|
||||
[&ent](KeyType&& k, inode::files_vector&& fv) {
|
||||
ent.emplace_back(std::move(k), std::move(fv));
|
||||
});
|
||||
std::sort(ent.begin(), ent.end(),
|
||||
[](auto& left, auto& right) { return left.first < right.first; });
|
||||
|
||||
DWARFS_CHECK(hash_.empty(), "expected hash to be empty");
|
||||
DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
|
||||
|
||||
uint32_t obj_num = 0;
|
||||
|
||||
@ -166,13 +197,10 @@ class file_scanner {
|
||||
finalize_inodes<false>(ent, inode_num, obj_num);
|
||||
}
|
||||
|
||||
uint32_t num_unique() const { return num_unique_; }
|
||||
|
||||
private:
|
||||
template <bool Unique>
|
||||
void finalize_inodes(
|
||||
std::vector<std::pair<std::string_view, inode::files_vector>>& ent,
|
||||
uint32_t& inode_num, uint32_t& obj_num) {
|
||||
template <bool Unique, typename KeyType>
|
||||
void
|
||||
finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
|
||||
uint32_t& inode_num, uint32_t& obj_num) {
|
||||
for (auto& p : ent) {
|
||||
auto& files = p.second;
|
||||
|
||||
@ -217,12 +245,13 @@ class file_scanner {
|
||||
os_access& os_;
|
||||
inode_manager& im_;
|
||||
inode_options const& ino_opts_;
|
||||
std::optional<std::string> const hash_algo_;
|
||||
progress& prog_;
|
||||
uint32_t num_unique_{0};
|
||||
std::vector<file*> hardlinked_;
|
||||
folly::F14FastMap<uint64_t, file*> hardlink_cache_;
|
||||
folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
|
||||
std::mutex mx_;
|
||||
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
|
||||
folly::F14FastMap<uint64_t, inode::files_vector> files_;
|
||||
};
|
||||
|
||||
class dir_set_inode_visitor : public visitor_base {
|
||||
@ -600,7 +629,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
prog.set_status_function(status_string);
|
||||
|
||||
inode_manager im(lgr_, prog);
|
||||
file_scanner fs(wg_, *os_, im, options_.inode, prog);
|
||||
file_scanner fs(wg_, *os_, im, options_.inode, options_.file_hash_algorithm,
|
||||
prog);
|
||||
|
||||
auto root = scan_tree(path, prog, fs);
|
||||
|
||||
|
@ -100,13 +100,15 @@ const std::map<std::string, file_order_mode> order_choices{
|
||||
{"script", file_order_mode::SCRIPT},
|
||||
#endif
|
||||
{"similarity", file_order_mode::SIMILARITY},
|
||||
{"nilsimsa", file_order_mode::NILSIMSA}};
|
||||
{"nilsimsa", file_order_mode::NILSIMSA},
|
||||
};
|
||||
|
||||
const std::map<std::string, console_writer::progress_mode> progress_modes{
|
||||
{"none", console_writer::NONE},
|
||||
{"simple", console_writer::SIMPLE},
|
||||
{"ascii", console_writer::ASCII},
|
||||
{"unicode", console_writer::UNICODE}};
|
||||
{"unicode", console_writer::UNICODE},
|
||||
};
|
||||
|
||||
const std::map<std::string, uint32_t> time_resolutions{
|
||||
{"sec", 1},
|
||||
@ -336,7 +338,8 @@ int mkdwarfs(int argc, char** argv) {
|
||||
block_manager::config cfg;
|
||||
std::string path, output, memory_limit, script_arg, compression, header,
|
||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata;
|
||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
||||
file_hash_algo;
|
||||
size_t num_workers;
|
||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||
force_overwrite = false;
|
||||
@ -355,6 +358,11 @@ int mkdwarfs(int argc, char** argv) {
|
||||
(from(time_resolutions) | get<0>() | unsplit(", ")) +
|
||||
")";
|
||||
|
||||
auto hash_list = checksum::available_algorithms();
|
||||
|
||||
auto file_hash_desc = "choice of file hashing function (none, " +
|
||||
(from(hash_list) | unsplit(", ")) + ")";
|
||||
|
||||
// clang-format off
|
||||
po::options_description opts("Command line options");
|
||||
opts.add_options()
|
||||
@ -453,6 +461,9 @@ int mkdwarfs(int argc, char** argv) {
|
||||
("no-create-timestamp",
|
||||
po::value<bool>(&options.no_create_timestamp)->zero_tokens(),
|
||||
"don't add create timestamp to file system")
|
||||
("file-hash",
|
||||
po::value<std::string>(&file_hash_algo)->default_value("xxh3-128"),
|
||||
file_hash_desc.c_str())
|
||||
("log-level",
|
||||
po::value<std::string>(&log_level_str)->default_value("info"),
|
||||
"log level (error, warn, info, debug, trace)")
|
||||
@ -657,6 +668,16 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (file_hash_algo == "none") {
|
||||
options.file_hash_algorithm.reset();
|
||||
} else if (checksum::is_available(file_hash_algo)) {
|
||||
options.file_hash_algorithm = file_hash_algo;
|
||||
} else {
|
||||
std::cerr << "error: unknown file hash function '" << file_hash_algo
|
||||
<< "'\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
size_t mem_limit = parse_size_with_unit(memory_limit);
|
||||
|
||||
worker_group wg_compress("compress", num_workers);
|
||||
|
@ -50,6 +50,8 @@ using namespace dwarfs;
|
||||
|
||||
namespace {
|
||||
|
||||
std::string const default_file_hash_algo{"xxh3-128"};
|
||||
|
||||
std::string
|
||||
build_dwarfs(logger& lgr, std::shared_ptr<test::os_access_mock> input,
|
||||
std::string const& compression,
|
||||
@ -80,7 +82,8 @@ void basic_end_to_end_test(std::string const& compressor,
|
||||
bool pack_directories, bool pack_shared_files_table,
|
||||
bool pack_names, bool pack_names_index,
|
||||
bool pack_symlinks, bool pack_symlinks_index,
|
||||
bool plain_names_table, bool plain_symlinks_table) {
|
||||
bool plain_names_table, bool plain_symlinks_table,
|
||||
std::optional<std::string> file_hash_algo) {
|
||||
block_manager::config cfg;
|
||||
scanner_options options;
|
||||
|
||||
@ -88,6 +91,7 @@ void basic_end_to_end_test(std::string const& compressor,
|
||||
cfg.block_size_bits = block_size_bits;
|
||||
|
||||
options.file_order.mode = file_order;
|
||||
options.file_hash_algorithm = file_hash_algo;
|
||||
options.with_devices = with_devices;
|
||||
options.with_specials = with_specials;
|
||||
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
||||
@ -397,8 +401,10 @@ class compression_test
|
||||
std::tuple<std::string, unsigned, file_order_mode>> {};
|
||||
|
||||
class scanner_test : public testing::TestWithParam<
|
||||
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
|
||||
};
|
||||
std::tuple<bool, bool, bool, bool, bool, bool, bool,
|
||||
std::optional<std::string>>> {};
|
||||
|
||||
class hashing_test : public testing::TestWithParam<std::string> {};
|
||||
|
||||
class packing_test : public testing::TestWithParam<
|
||||
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
|
||||
@ -417,17 +423,24 @@ TEST_P(compression_test, end_to_end) {
|
||||
|
||||
basic_end_to_end_test(compressor, block_size_bits, file_order, true, true,
|
||||
false, false, false, false, false, true, true, true,
|
||||
true, true, true, true, false, false);
|
||||
true, true, true, true, false, false,
|
||||
default_file_hash_algo);
|
||||
}
|
||||
|
||||
TEST_P(scanner_test, end_to_end) {
|
||||
auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times,
|
||||
enable_nlink] = GetParam();
|
||||
enable_nlink, file_hash_algo] = GetParam();
|
||||
|
||||
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE,
|
||||
with_devices, with_specials, set_uid, set_gid, set_time,
|
||||
keep_all_times, enable_nlink, true, true, true, true,
|
||||
true, true, true, false, false);
|
||||
true, true, true, false, false, file_hash_algo);
|
||||
}
|
||||
|
||||
TEST_P(hashing_test, end_to_end) {
|
||||
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
|
||||
true, true, true, true, true, true, true, true, true,
|
||||
true, true, true, false, false, GetParam());
|
||||
}
|
||||
|
||||
TEST_P(packing_test, end_to_end) {
|
||||
@ -438,7 +451,7 @@ TEST_P(packing_test, end_to_end) {
|
||||
false, false, false, false, false, pack_chunk_table,
|
||||
pack_directories, pack_shared_files_table, pack_names,
|
||||
pack_names_index, pack_symlinks, pack_symlinks_index,
|
||||
false, false);
|
||||
false, false, default_file_hash_algo);
|
||||
}
|
||||
|
||||
TEST_P(plain_tables_test, end_to_end) {
|
||||
@ -447,7 +460,7 @@ TEST_P(plain_tables_test, end_to_end) {
|
||||
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
|
||||
false, false, false, false, false, false, false, false,
|
||||
false, false, false, false, plain_names_table,
|
||||
plain_symlinks_table);
|
||||
plain_symlinks_table, default_file_hash_algo);
|
||||
}
|
||||
|
||||
TEST_P(packing_test, regression_empty_fs) {
|
||||
@ -516,7 +529,11 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
dwarfs, scanner_test,
|
||||
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||
::testing::Bool()));
|
||||
::testing::Bool(),
|
||||
::testing::Values(std::nullopt, "xxh3-128", "sha512")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(dwarfs, hashing_test,
|
||||
::testing::ValuesIn(checksum::available_algorithms()));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dwarfs, packing_test,
|
||||
@ -661,7 +678,12 @@ TEST_P(compression_regression, github45) {
|
||||
INSTANTIATE_TEST_SUITE_P(dwarfs, compression_regression,
|
||||
::testing::ValuesIn(compressions));
|
||||
|
||||
TEST(scanner, inode_ordering) {
|
||||
class file_scanner : public testing::TestWithParam<std::optional<std::string>> {
|
||||
};
|
||||
|
||||
TEST_P(file_scanner, inode_ordering) {
|
||||
auto file_hash_algo = GetParam();
|
||||
|
||||
std::ostringstream logss;
|
||||
stream_logger lgr(logss); // TODO: mock
|
||||
lgr.set_policy<prod_logger_policy>();
|
||||
@ -670,9 +692,10 @@ TEST(scanner, inode_ordering) {
|
||||
auto opts = scanner_options();
|
||||
|
||||
opts.file_order.mode = file_order_mode::PATH;
|
||||
opts.file_hash_algorithm = file_hash_algo;
|
||||
|
||||
auto input = std::make_shared<test::os_access_mock>();
|
||||
constexpr int dim = 15;
|
||||
constexpr int dim = 14;
|
||||
|
||||
input->add_dir("");
|
||||
|
||||
@ -693,3 +716,6 @@ TEST(scanner, inode_ordering) {
|
||||
EXPECT_EQ(ref, build_dwarfs(lgr, input, "null", bmcfg, opts));
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(dwarfs, file_scanner,
|
||||
::testing::Values(std::nullopt, "xxh3-128"));
|
||||
|
Loading…
x
Reference in New Issue
Block a user