mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-18 08:49:29 -04:00
Add --file-hash option (fixes github #92)
This does not yet address the issue that uniquely sized files are unnecessarily hashed, which is also mentioned in #92. This will be addressed separately.
This commit is contained in:
parent
482a40560e
commit
148de5bf0d
@ -276,6 +276,10 @@ Most other options are concerned with compression tuning:
|
|||||||
Don't add a creation timestamp. This is useful when bit-identical file
|
Don't add a creation timestamp. This is useful when bit-identical file
|
||||||
system images are required to be produced from the same input.
|
system images are required to be produced from the same input.
|
||||||
|
|
||||||
|
- `--file-hash=none`|*name*:
|
||||||
|
Select the hashing function to be used for file deduplication. If `none`
|
||||||
|
is chosen, file deduplication is disabled.
|
||||||
|
|
||||||
- `--log-level=`*name*:
|
- `--log-level=`*name*:
|
||||||
Specify a logging level.
|
Specify a logging level.
|
||||||
|
|
||||||
|
@ -38,27 +38,14 @@ class checksum {
|
|||||||
XXH3_128,
|
XXH3_128,
|
||||||
};
|
};
|
||||||
|
|
||||||
static constexpr size_t digest_size(algorithm alg) {
|
static bool is_available(std::string const& algo);
|
||||||
switch (alg) {
|
static std::vector<std::string> available_algorithms();
|
||||||
case algorithm::SHA1:
|
|
||||||
return 20;
|
|
||||||
case algorithm::SHA2_512_256:
|
|
||||||
return 32;
|
|
||||||
case algorithm::XXH3_64:
|
|
||||||
return 8;
|
|
||||||
case algorithm::XXH3_128:
|
|
||||||
return 16;
|
|
||||||
}
|
|
||||||
DWARFS_CHECK(false, "unknown algorithm");
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
static bool verify(algorithm alg, void const* data, size_t size,
|
||||||
compute(algorithm alg, void const* data, size_t size, void* digest);
|
void const* digest, size_t digest_size);
|
||||||
|
|
||||||
static bool
|
|
||||||
verify(algorithm alg, void const* data, size_t size, void const* digest);
|
|
||||||
|
|
||||||
checksum(algorithm alg);
|
checksum(algorithm alg);
|
||||||
|
checksum(std::string const& alg);
|
||||||
|
|
||||||
checksum& update(void const* data, size_t size) {
|
checksum& update(void const* data, size_t size) {
|
||||||
impl_->update(data, size);
|
impl_->update(data, size);
|
||||||
@ -69,7 +56,7 @@ class checksum {
|
|||||||
|
|
||||||
bool verify(void const* digest) const;
|
bool verify(void const* digest) const;
|
||||||
|
|
||||||
algorithm type() const { return alg_; }
|
size_t digest_size() const { return impl_->digest_size(); }
|
||||||
|
|
||||||
class impl {
|
class impl {
|
||||||
public:
|
public:
|
||||||
@ -77,11 +64,11 @@ class checksum {
|
|||||||
|
|
||||||
virtual void update(void const* data, size_t size) = 0;
|
virtual void update(void const* data, size_t size) = 0;
|
||||||
virtual bool finalize(void* digest) = 0;
|
virtual bool finalize(void* digest) = 0;
|
||||||
|
virtual size_t digest_size() = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<impl> impl_;
|
std::unique_ptr<impl> impl_;
|
||||||
algorithm const alg_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -21,7 +21,6 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <array>
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
@ -33,6 +32,8 @@
|
|||||||
|
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
#include <folly/small_vector.h>
|
||||||
|
|
||||||
#include "dwarfs/entry_interface.h"
|
#include "dwarfs/entry_interface.h"
|
||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
@ -126,7 +127,8 @@ class file : public entry {
|
|||||||
std::shared_ptr<inode> get_inode() const;
|
std::shared_ptr<inode> get_inode() const;
|
||||||
void accept(entry_visitor& v, bool preorder) override;
|
void accept(entry_visitor& v, bool preorder) override;
|
||||||
void scan(os_access& os, progress& prog) override;
|
void scan(os_access& os, progress& prog) override;
|
||||||
void scan(std::shared_ptr<mmif> const& mm, progress& prog);
|
void scan(std::shared_ptr<mmif> const& mm, progress& prog,
|
||||||
|
std::optional<std::string> const& hash_alg);
|
||||||
void create_data();
|
void create_data();
|
||||||
void hardlink(file* other, progress& prog);
|
void hardlink(file* other, progress& prog);
|
||||||
uint32_t unique_file_id() const;
|
uint32_t unique_file_id() const;
|
||||||
@ -138,8 +140,8 @@ class file : public entry {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
struct data {
|
struct data {
|
||||||
using hash_type = std::array<char, 16>;
|
using hash_type = folly::small_vector<char, 16>;
|
||||||
hash_type hash{0};
|
hash_type hash;
|
||||||
uint32_t refcount{1};
|
uint32_t refcount{1};
|
||||||
std::optional<uint32_t> inode_num;
|
std::optional<uint32_t> inode_num;
|
||||||
};
|
};
|
||||||
|
@ -108,11 +108,6 @@ struct section_header_v2 {
|
|||||||
uint16_t compression; // [54] compression
|
uint16_t compression; // [54] compression
|
||||||
uint64_t length; // [56] length of section
|
uint64_t length; // [56] length of section
|
||||||
|
|
||||||
static_assert(checksum::digest_size(checksum::algorithm::XXH3_64) ==
|
|
||||||
sizeof(xxh3_64));
|
|
||||||
static_assert(checksum::digest_size(checksum::algorithm::SHA2_512_256) ==
|
|
||||||
sizeof(sha2_512_256));
|
|
||||||
|
|
||||||
std::string to_string() const;
|
std::string to_string() const;
|
||||||
void dump(std::ostream& os) const;
|
void dump(std::ostream& os) const;
|
||||||
};
|
};
|
||||||
|
@ -87,6 +87,7 @@ struct file_order_options {
|
|||||||
|
|
||||||
struct scanner_options {
|
struct scanner_options {
|
||||||
file_order_options file_order;
|
file_order_options file_order;
|
||||||
|
std::optional<std::string> file_hash_algorithm{"xxh3-128"};
|
||||||
std::optional<uint16_t> uid;
|
std::optional<uint16_t> uid;
|
||||||
std::optional<uint16_t> gid;
|
std::optional<uint16_t> gid;
|
||||||
std::optional<uint64_t> timestamp;
|
std::optional<uint64_t> timestamp;
|
||||||
|
@ -19,8 +19,11 @@
|
|||||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <functional>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
#include <openssl/evp.h>
|
#include <openssl/evp.h>
|
||||||
|
|
||||||
@ -35,46 +38,29 @@ namespace dwarfs {
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
bool compute_evp(const EVP_MD* algorithm, void const* data, size_t size,
|
std::unordered_set<std::string> supported_algorithms{
|
||||||
void* digest, unsigned int* digest_size) {
|
"xxh3-64",
|
||||||
return EVP_Digest(data, size, reinterpret_cast<unsigned char*>(digest),
|
"xxh3-128",
|
||||||
digest_size, algorithm, nullptr);
|
};
|
||||||
}
|
|
||||||
|
|
||||||
bool compute_xxh3_64(void const* data, size_t size, void* digest) {
|
|
||||||
auto hash = XXH3_64bits(data, size);
|
|
||||||
static_assert(checksum::digest_size(checksum::algorithm::XXH3_64) ==
|
|
||||||
sizeof(hash));
|
|
||||||
::memcpy(digest, &hash, sizeof(hash));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool compute_xxh3_128(void const* data, size_t size, void* digest) {
|
|
||||||
auto hash = XXH3_128bits(data, size);
|
|
||||||
static_assert(checksum::digest_size(checksum::algorithm::XXH3_128) ==
|
|
||||||
sizeof(hash));
|
|
||||||
::memcpy(digest, &hash, sizeof(hash));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
class checksum_evp : public checksum::impl {
|
class checksum_evp : public checksum::impl {
|
||||||
public:
|
public:
|
||||||
checksum_evp(EVP_MD const* evp, checksum::algorithm alg)
|
checksum_evp(::EVP_MD const* evp)
|
||||||
: context_(EVP_MD_CTX_new())
|
: context_(::EVP_MD_CTX_new())
|
||||||
, dig_size_(checksum::digest_size(alg)) {
|
, dig_size_(::EVP_MD_size(evp)) {
|
||||||
EVP_DigestInit_ex(context_, evp, nullptr);
|
::EVP_DigestInit_ex(context_, evp, nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
~checksum_evp() override { EVP_MD_CTX_destroy(context_); }
|
~checksum_evp() override { ::EVP_MD_CTX_destroy(context_); }
|
||||||
|
|
||||||
void update(void const* data, size_t size) override {
|
void update(void const* data, size_t size) override {
|
||||||
DWARFS_CHECK(EVP_DigestUpdate(context_, data, size),
|
DWARFS_CHECK(::EVP_DigestUpdate(context_, data, size),
|
||||||
"EVP_DigestUpdate() failed");
|
"EVP_DigestUpdate() failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool finalize(void* digest) override {
|
bool finalize(void* digest) override {
|
||||||
unsigned int dig_size = 0;
|
unsigned int dig_size = 0;
|
||||||
bool rv = EVP_DigestFinal_ex(
|
bool rv = ::EVP_DigestFinal_ex(
|
||||||
context_, reinterpret_cast<unsigned char*>(digest), &dig_size);
|
context_, reinterpret_cast<unsigned char*>(digest), &dig_size);
|
||||||
|
|
||||||
if (rv) {
|
if (rv) {
|
||||||
@ -86,8 +72,27 @@ class checksum_evp : public checksum::impl {
|
|||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::vector<std::string> available_algorithms() {
|
||||||
|
std::vector<std::string> available;
|
||||||
|
::EVP_MD_do_all(
|
||||||
|
[](const ::EVP_MD*, const char* from, const char* to, void* vec) {
|
||||||
|
if (!to) {
|
||||||
|
reinterpret_cast<std::vector<std::string>*>(vec)->emplace_back(
|
||||||
|
from);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
&available);
|
||||||
|
return available;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_available(std::string const& algo) {
|
||||||
|
return ::EVP_get_digestbyname(algo.c_str()) != nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t digest_size() override { return dig_size_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
EVP_MD_CTX* context_;
|
::EVP_MD_CTX* context_;
|
||||||
size_t const dig_size_;
|
size_t const dig_size_;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -113,6 +118,10 @@ class checksum_xxh3_64 : public checksum::impl {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t digest_size() override {
|
||||||
|
return sizeof(decltype(std::function{XXH3_64bits_digest})::result_type);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
XXH3_state_t* state_;
|
XXH3_state_t* state_;
|
||||||
};
|
};
|
||||||
@ -139,57 +148,47 @@ class checksum_xxh3_128 : public checksum::impl {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t digest_size() override {
|
||||||
|
return sizeof(decltype(std::function{XXH3_128bits_digest})::result_type);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
XXH3_state_t* state_;
|
XXH3_state_t* state_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
bool checksum::compute(algorithm alg, void const* data, size_t size,
|
bool checksum::is_available(std::string const& algo) {
|
||||||
void* digest) {
|
return supported_algorithms.count(algo) or checksum_evp::is_available(algo);
|
||||||
bool rv = false;
|
|
||||||
unsigned int dig_size = 0;
|
|
||||||
|
|
||||||
switch (alg) {
|
|
||||||
case algorithm::SHA1:
|
|
||||||
rv = compute_evp(EVP_sha1(), data, size, digest, &dig_size);
|
|
||||||
break;
|
|
||||||
case algorithm::SHA2_512_256:
|
|
||||||
rv = compute_evp(EVP_sha512_256(), data, size, digest, &dig_size);
|
|
||||||
break;
|
|
||||||
case algorithm::XXH3_64:
|
|
||||||
rv = compute_xxh3_64(data, size, digest);
|
|
||||||
break;
|
|
||||||
case algorithm::XXH3_128:
|
|
||||||
rv = compute_xxh3_128(data, size, digest);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rv && dig_size > 0) {
|
std::vector<std::string> checksum::available_algorithms() {
|
||||||
DWARFS_CHECK(digest_size(alg) == dig_size,
|
auto available_evp = checksum_evp::available_algorithms();
|
||||||
fmt::format("digest size mismatch: {0} != {1} [{2}]",
|
std::vector<std::string> available;
|
||||||
digest_size(alg), dig_size,
|
available.insert(available.end(), supported_algorithms.begin(),
|
||||||
static_cast<int>(alg)));
|
supported_algorithms.end());
|
||||||
}
|
available.insert(available.end(), available_evp.begin(), available_evp.end());
|
||||||
|
std::sort(available.begin(), available.end());
|
||||||
return rv;
|
return available;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool checksum::verify(algorithm alg, void const* data, size_t size,
|
bool checksum::verify(algorithm alg, void const* data, size_t size,
|
||||||
const void* digest) {
|
const void* digest, size_t digest_size) {
|
||||||
std::array<char, EVP_MAX_MD_SIZE> tmp;
|
std::array<char, EVP_MAX_MD_SIZE> tmp;
|
||||||
return compute(alg, data, size, tmp.data()) &&
|
checksum cs(alg);
|
||||||
::memcmp(digest, tmp.data(), digest_size(alg)) == 0;
|
DWARFS_CHECK(digest_size == cs.digest_size(), "digest size mismatch");
|
||||||
|
cs.update(data, size);
|
||||||
|
return cs.finalize(tmp.data()) &&
|
||||||
|
::memcmp(digest, tmp.data(), digest_size) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
checksum::checksum(algorithm alg)
|
checksum::checksum(algorithm alg) {
|
||||||
: alg_(alg) {
|
|
||||||
switch (alg) {
|
switch (alg) {
|
||||||
case algorithm::SHA1:
|
case algorithm::SHA1:
|
||||||
impl_ = std::make_unique<checksum_evp>(EVP_sha1(), alg);
|
impl_ = std::make_unique<checksum_evp>(::EVP_sha1());
|
||||||
break;
|
break;
|
||||||
case algorithm::SHA2_512_256:
|
case algorithm::SHA2_512_256:
|
||||||
impl_ = std::make_unique<checksum_evp>(EVP_sha512_256(), alg);
|
impl_ = std::make_unique<checksum_evp>(::EVP_sha512_256());
|
||||||
break;
|
break;
|
||||||
case algorithm::XXH3_64:
|
case algorithm::XXH3_64:
|
||||||
impl_ = std::make_unique<checksum_xxh3_64>();
|
impl_ = std::make_unique<checksum_xxh3_64>();
|
||||||
@ -203,10 +202,22 @@ checksum::checksum(algorithm alg)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
checksum::checksum(std::string const& alg) {
|
||||||
|
if (alg == "xxh3-64") {
|
||||||
|
impl_ = std::make_unique<checksum_xxh3_64>();
|
||||||
|
} else if (alg == "xxh3-128") {
|
||||||
|
impl_ = std::make_unique<checksum_xxh3_128>();
|
||||||
|
} else if (auto md = ::EVP_get_digestbyname(alg.c_str())) {
|
||||||
|
impl_ = std::make_unique<checksum_evp>(md);
|
||||||
|
} else {
|
||||||
|
DWARFS_CHECK(false, "unknown algorithm");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool checksum::verify(void const* digest) const {
|
bool checksum::verify(void const* digest) const {
|
||||||
std::array<char, EVP_MAX_MD_SIZE> tmp;
|
std::array<char, EVP_MAX_MD_SIZE> tmp;
|
||||||
return impl_->finalize(tmp.data()) &&
|
return impl_->finalize(tmp.data()) &&
|
||||||
::memcmp(digest, tmp.data(), digest_size(alg_)) == 0;
|
::memcmp(digest, tmp.data(), impl_->digest_size()) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -142,7 +142,7 @@ void entry::set_ctime(uint64_t ctime) { stat_.st_atime = ctime; }
|
|||||||
|
|
||||||
std::string_view file::hash() const {
|
std::string_view file::hash() const {
|
||||||
auto& h = data_->hash;
|
auto& h = data_->hash;
|
||||||
return std::string_view(&h[0], h.size());
|
return std::string_view(h.data(), h.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void file::set_inode(std::shared_ptr<inode> ino) {
|
void file::set_inode(std::shared_ptr<inode> ino) {
|
||||||
@ -164,17 +164,17 @@ void file::scan(os_access& os, progress& prog) {
|
|||||||
mm = os.map_file(path(), s);
|
mm = os.map_file(path(), s);
|
||||||
}
|
}
|
||||||
|
|
||||||
scan(mm, prog);
|
scan(mm, prog, "xxh3-128");
|
||||||
}
|
}
|
||||||
|
|
||||||
void file::scan(std::shared_ptr<mmif> const& mm, progress& prog) {
|
void file::scan(std::shared_ptr<mmif> const& mm, progress& prog,
|
||||||
constexpr auto alg = checksum::algorithm::XXH3_128;
|
std::optional<std::string> const& hash_alg) {
|
||||||
static_assert(checksum::digest_size(alg) == sizeof(data::hash_type));
|
if (hash_alg) {
|
||||||
|
checksum cs(*hash_alg);
|
||||||
|
|
||||||
if (size_t s = size(); s > 0) {
|
if (size_t s = size(); s > 0) {
|
||||||
constexpr size_t chunk_size = 32 << 20;
|
constexpr size_t chunk_size = 32 << 20;
|
||||||
prog.original_size += s;
|
prog.original_size += s;
|
||||||
checksum cs(alg);
|
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
|
|
||||||
while (s >= chunk_size) {
|
while (s >= chunk_size) {
|
||||||
@ -185,11 +185,14 @@ void file::scan(std::shared_ptr<mmif> const& mm, progress& prog) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cs.update(mm->as<void>(offset), s);
|
cs.update(mm->as<void>(offset), s);
|
||||||
|
}
|
||||||
|
|
||||||
DWARFS_CHECK(cs.finalize(&data_->hash[0]), "checksum computation failed");
|
data_->hash.resize(cs.digest_size());
|
||||||
} else {
|
|
||||||
DWARFS_CHECK(checksum::compute(alg, nullptr, 0, &data_->hash[0]),
|
DWARFS_CHECK(cs.finalize(data_->hash.data()),
|
||||||
"checksum computation failed");
|
"checksum computation failed");
|
||||||
|
} else {
|
||||||
|
prog.original_size += size();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,9 +122,9 @@ class fs_section_v2 : public fs_section::impl {
|
|||||||
bool check_fast(mmif& mm) const override {
|
bool check_fast(mmif& mm) const override {
|
||||||
auto hdr_cs_len =
|
auto hdr_cs_len =
|
||||||
sizeof(section_header_v2) - offsetof(section_header_v2, number);
|
sizeof(section_header_v2) - offsetof(section_header_v2, number);
|
||||||
return checksum::verify(checksum::algorithm::XXH3_64,
|
return checksum::verify(
|
||||||
mm.as<void>(start_ - hdr_cs_len),
|
checksum::algorithm::XXH3_64, mm.as<void>(start_ - hdr_cs_len),
|
||||||
hdr_.length + hdr_cs_len, &hdr_.xxh3_64);
|
hdr_.length + hdr_cs_len, &hdr_.xxh3_64, sizeof(hdr_.xxh3_64));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool verify(mmif& mm) const override {
|
bool verify(mmif& mm) const override {
|
||||||
@ -132,7 +132,8 @@ class fs_section_v2 : public fs_section::impl {
|
|||||||
sizeof(section_header_v2) - offsetof(section_header_v2, xxh3_64);
|
sizeof(section_header_v2) - offsetof(section_header_v2, xxh3_64);
|
||||||
return checksum::verify(checksum::algorithm::SHA2_512_256,
|
return checksum::verify(checksum::algorithm::SHA2_512_256,
|
||||||
mm.as<void>(start_ - hdr_sha_len),
|
mm.as<void>(start_ - hdr_sha_len),
|
||||||
hdr_.length + hdr_sha_len, &hdr_.sha2_512_256);
|
hdr_.length + hdr_sha_len, &hdr_.sha2_512_256,
|
||||||
|
sizeof(hdr_.sha2_512_256));
|
||||||
}
|
}
|
||||||
|
|
||||||
folly::ByteRange data(mmif& mm) const override {
|
folly::ByteRange data(mmif& mm) const override {
|
||||||
|
@ -77,22 +77,23 @@ class visitor_base : public entry_visitor {
|
|||||||
class file_scanner {
|
class file_scanner {
|
||||||
public:
|
public:
|
||||||
file_scanner(worker_group& wg, os_access& os, inode_manager& im,
|
file_scanner(worker_group& wg, os_access& os, inode_manager& im,
|
||||||
inode_options const& ino_opts, progress& prog)
|
inode_options const& ino_opts,
|
||||||
|
std::optional<std::string> const& hash_algo, progress& prog)
|
||||||
: wg_(wg)
|
: wg_(wg)
|
||||||
, os_(os)
|
, os_(os)
|
||||||
, im_(im)
|
, im_(im)
|
||||||
, ino_opts_(ino_opts)
|
, ino_opts_(ino_opts)
|
||||||
|
, hash_algo_{hash_algo}
|
||||||
, prog_(prog) {}
|
, prog_(prog) {}
|
||||||
|
|
||||||
void scan(file* p) {
|
void scan(file* p) {
|
||||||
if (p->num_hard_links() > 1) {
|
if (p->num_hard_links() > 1) {
|
||||||
auto ino = p->raw_inode_num();
|
auto& vec = hardlinks_[p->raw_inode_num()];
|
||||||
auto [it, is_new] = hardlink_cache_.emplace(ino, p);
|
vec.push_back(p);
|
||||||
|
|
||||||
if (!is_new) {
|
if (vec.size() > 1) {
|
||||||
p->hardlink(it->second, prog_);
|
p->hardlink(vec[0], prog_);
|
||||||
++prog_.files_scanned;
|
++prog_.files_scanned;
|
||||||
hardlinked_.push_back(p);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -108,12 +109,13 @@ class file_scanner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
prog_.current.store(p);
|
prog_.current.store(p);
|
||||||
p->scan(mm, prog_);
|
p->scan(mm, prog_, hash_algo_);
|
||||||
++prog_.files_scanned;
|
++prog_.files_scanned;
|
||||||
std::shared_ptr<inode> inode;
|
std::shared_ptr<inode> inode;
|
||||||
|
|
||||||
{
|
{
|
||||||
std::lock_guard lock(mx_);
|
std::lock_guard lock(mx_);
|
||||||
|
if (hash_algo_) {
|
||||||
auto& ref = hash_[p->hash()];
|
auto& ref = hash_[p->hash()];
|
||||||
if (ref.empty()) {
|
if (ref.empty()) {
|
||||||
inode = im_.create_inode();
|
inode = im_.create_inode();
|
||||||
@ -122,6 +124,11 @@ class file_scanner {
|
|||||||
p->set_inode(ref.front()->get_inode());
|
p->set_inode(ref.front()->get_inode());
|
||||||
}
|
}
|
||||||
ref.push_back(p);
|
ref.push_back(p);
|
||||||
|
} else {
|
||||||
|
files_[p->raw_inode_num()].push_back(p);
|
||||||
|
inode = im_.create_inode();
|
||||||
|
p->set_inode(inode);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inode) {
|
if (inode) {
|
||||||
@ -139,26 +146,50 @@ class file_scanner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void finalize(uint32_t& inode_num) {
|
void finalize(uint32_t& inode_num) {
|
||||||
hardlink_cache_.clear();
|
if (hash_algo_) {
|
||||||
|
finalize_hardlinks(hash_, [](file const* p) { return p->hash(); });
|
||||||
for (auto p : hardlinked_) {
|
finalize_files(hash_, inode_num);
|
||||||
auto& fv = hash_[p->hash()];
|
} else {
|
||||||
p->set_inode(fv.front()->get_inode());
|
finalize_hardlinks(files_,
|
||||||
fv.push_back(p);
|
[](file const* p) { return p->raw_inode_num(); });
|
||||||
|
finalize_files(files_, inode_num);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
hardlinked_.clear();
|
uint32_t num_unique() const { return num_unique_; }
|
||||||
|
|
||||||
std::vector<std::pair<std::string_view, inode::files_vector>> ent;
|
private:
|
||||||
ent.reserve(hash_.size());
|
template <typename KeyType, typename Lookup>
|
||||||
hash_.eraseInto(hash_.begin(), hash_.end(),
|
void finalize_hardlinks(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
|
||||||
[&ent](std::string_view&& h, inode::files_vector&& fv) {
|
Lookup&& lookup) {
|
||||||
ent.emplace_back(std::move(h), std::move(fv));
|
for (auto& kv : hardlinks_) {
|
||||||
|
auto& hlv = kv.second;
|
||||||
|
if (hlv.size() > 1) {
|
||||||
|
auto& fv = fmap[lookup(hlv.front())];
|
||||||
|
// TODO: for (auto p : hlv | std::views::drop(1)) {
|
||||||
|
std::for_each(hlv.begin() + 1, hlv.end(), [&fv](auto p) {
|
||||||
|
p->set_inode(fv.front()->get_inode());
|
||||||
|
fv.push_back(p);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hardlinks_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename KeyType>
|
||||||
|
void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
|
||||||
|
uint32_t& inode_num) {
|
||||||
|
std::vector<std::pair<KeyType, inode::files_vector>> ent;
|
||||||
|
ent.reserve(fmap.size());
|
||||||
|
fmap.eraseInto(fmap.begin(), fmap.end(),
|
||||||
|
[&ent](KeyType&& k, inode::files_vector&& fv) {
|
||||||
|
ent.emplace_back(std::move(k), std::move(fv));
|
||||||
});
|
});
|
||||||
std::sort(ent.begin(), ent.end(),
|
std::sort(ent.begin(), ent.end(),
|
||||||
[](auto& left, auto& right) { return left.first < right.first; });
|
[](auto& left, auto& right) { return left.first < right.first; });
|
||||||
|
|
||||||
DWARFS_CHECK(hash_.empty(), "expected hash to be empty");
|
DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
|
||||||
|
|
||||||
uint32_t obj_num = 0;
|
uint32_t obj_num = 0;
|
||||||
|
|
||||||
@ -166,12 +197,9 @@ class file_scanner {
|
|||||||
finalize_inodes<false>(ent, inode_num, obj_num);
|
finalize_inodes<false>(ent, inode_num, obj_num);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t num_unique() const { return num_unique_; }
|
template <bool Unique, typename KeyType>
|
||||||
|
void
|
||||||
private:
|
finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
|
||||||
template <bool Unique>
|
|
||||||
void finalize_inodes(
|
|
||||||
std::vector<std::pair<std::string_view, inode::files_vector>>& ent,
|
|
||||||
uint32_t& inode_num, uint32_t& obj_num) {
|
uint32_t& inode_num, uint32_t& obj_num) {
|
||||||
for (auto& p : ent) {
|
for (auto& p : ent) {
|
||||||
auto& files = p.second;
|
auto& files = p.second;
|
||||||
@ -217,12 +245,13 @@ class file_scanner {
|
|||||||
os_access& os_;
|
os_access& os_;
|
||||||
inode_manager& im_;
|
inode_manager& im_;
|
||||||
inode_options const& ino_opts_;
|
inode_options const& ino_opts_;
|
||||||
|
std::optional<std::string> const hash_algo_;
|
||||||
progress& prog_;
|
progress& prog_;
|
||||||
uint32_t num_unique_{0};
|
uint32_t num_unique_{0};
|
||||||
std::vector<file*> hardlinked_;
|
folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
|
||||||
folly::F14FastMap<uint64_t, file*> hardlink_cache_;
|
|
||||||
std::mutex mx_;
|
std::mutex mx_;
|
||||||
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
|
folly::F14FastMap<std::string_view, inode::files_vector> hash_;
|
||||||
|
folly::F14FastMap<uint64_t, inode::files_vector> files_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class dir_set_inode_visitor : public visitor_base {
|
class dir_set_inode_visitor : public visitor_base {
|
||||||
@ -600,7 +629,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
|||||||
prog.set_status_function(status_string);
|
prog.set_status_function(status_string);
|
||||||
|
|
||||||
inode_manager im(lgr_, prog);
|
inode_manager im(lgr_, prog);
|
||||||
file_scanner fs(wg_, *os_, im, options_.inode, prog);
|
file_scanner fs(wg_, *os_, im, options_.inode, options_.file_hash_algorithm,
|
||||||
|
prog);
|
||||||
|
|
||||||
auto root = scan_tree(path, prog, fs);
|
auto root = scan_tree(path, prog, fs);
|
||||||
|
|
||||||
|
@ -100,13 +100,15 @@ const std::map<std::string, file_order_mode> order_choices{
|
|||||||
{"script", file_order_mode::SCRIPT},
|
{"script", file_order_mode::SCRIPT},
|
||||||
#endif
|
#endif
|
||||||
{"similarity", file_order_mode::SIMILARITY},
|
{"similarity", file_order_mode::SIMILARITY},
|
||||||
{"nilsimsa", file_order_mode::NILSIMSA}};
|
{"nilsimsa", file_order_mode::NILSIMSA},
|
||||||
|
};
|
||||||
|
|
||||||
const std::map<std::string, console_writer::progress_mode> progress_modes{
|
const std::map<std::string, console_writer::progress_mode> progress_modes{
|
||||||
{"none", console_writer::NONE},
|
{"none", console_writer::NONE},
|
||||||
{"simple", console_writer::SIMPLE},
|
{"simple", console_writer::SIMPLE},
|
||||||
{"ascii", console_writer::ASCII},
|
{"ascii", console_writer::ASCII},
|
||||||
{"unicode", console_writer::UNICODE}};
|
{"unicode", console_writer::UNICODE},
|
||||||
|
};
|
||||||
|
|
||||||
const std::map<std::string, uint32_t> time_resolutions{
|
const std::map<std::string, uint32_t> time_resolutions{
|
||||||
{"sec", 1},
|
{"sec", 1},
|
||||||
@ -336,7 +338,8 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
block_manager::config cfg;
|
block_manager::config cfg;
|
||||||
std::string path, output, memory_limit, script_arg, compression, header,
|
std::string path, output, memory_limit, script_arg, compression, header,
|
||||||
schema_compression, metadata_compression, log_level_str, timestamp,
|
schema_compression, metadata_compression, log_level_str, timestamp,
|
||||||
time_resolution, order, progress_mode, recompress_opts, pack_metadata;
|
time_resolution, order, progress_mode, recompress_opts, pack_metadata,
|
||||||
|
file_hash_algo;
|
||||||
size_t num_workers;
|
size_t num_workers;
|
||||||
bool no_progress = false, remove_header = false, no_section_index = false,
|
bool no_progress = false, remove_header = false, no_section_index = false,
|
||||||
force_overwrite = false;
|
force_overwrite = false;
|
||||||
@ -355,6 +358,11 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
(from(time_resolutions) | get<0>() | unsplit(", ")) +
|
(from(time_resolutions) | get<0>() | unsplit(", ")) +
|
||||||
")";
|
")";
|
||||||
|
|
||||||
|
auto hash_list = checksum::available_algorithms();
|
||||||
|
|
||||||
|
auto file_hash_desc = "choice of file hashing function (none, " +
|
||||||
|
(from(hash_list) | unsplit(", ")) + ")";
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
po::options_description opts("Command line options");
|
po::options_description opts("Command line options");
|
||||||
opts.add_options()
|
opts.add_options()
|
||||||
@ -453,6 +461,9 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
("no-create-timestamp",
|
("no-create-timestamp",
|
||||||
po::value<bool>(&options.no_create_timestamp)->zero_tokens(),
|
po::value<bool>(&options.no_create_timestamp)->zero_tokens(),
|
||||||
"don't add create timestamp to file system")
|
"don't add create timestamp to file system")
|
||||||
|
("file-hash",
|
||||||
|
po::value<std::string>(&file_hash_algo)->default_value("xxh3-128"),
|
||||||
|
file_hash_desc.c_str())
|
||||||
("log-level",
|
("log-level",
|
||||||
po::value<std::string>(&log_level_str)->default_value("info"),
|
po::value<std::string>(&log_level_str)->default_value("info"),
|
||||||
"log level (error, warn, info, debug, trace)")
|
"log level (error, warn, info, debug, trace)")
|
||||||
@ -657,6 +668,16 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (file_hash_algo == "none") {
|
||||||
|
options.file_hash_algorithm.reset();
|
||||||
|
} else if (checksum::is_available(file_hash_algo)) {
|
||||||
|
options.file_hash_algorithm = file_hash_algo;
|
||||||
|
} else {
|
||||||
|
std::cerr << "error: unknown file hash function '" << file_hash_algo
|
||||||
|
<< "'\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
size_t mem_limit = parse_size_with_unit(memory_limit);
|
size_t mem_limit = parse_size_with_unit(memory_limit);
|
||||||
|
|
||||||
worker_group wg_compress("compress", num_workers);
|
worker_group wg_compress("compress", num_workers);
|
||||||
|
@ -50,6 +50,8 @@ using namespace dwarfs;
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
std::string const default_file_hash_algo{"xxh3-128"};
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
build_dwarfs(logger& lgr, std::shared_ptr<test::os_access_mock> input,
|
build_dwarfs(logger& lgr, std::shared_ptr<test::os_access_mock> input,
|
||||||
std::string const& compression,
|
std::string const& compression,
|
||||||
@ -80,7 +82,8 @@ void basic_end_to_end_test(std::string const& compressor,
|
|||||||
bool pack_directories, bool pack_shared_files_table,
|
bool pack_directories, bool pack_shared_files_table,
|
||||||
bool pack_names, bool pack_names_index,
|
bool pack_names, bool pack_names_index,
|
||||||
bool pack_symlinks, bool pack_symlinks_index,
|
bool pack_symlinks, bool pack_symlinks_index,
|
||||||
bool plain_names_table, bool plain_symlinks_table) {
|
bool plain_names_table, bool plain_symlinks_table,
|
||||||
|
std::optional<std::string> file_hash_algo) {
|
||||||
block_manager::config cfg;
|
block_manager::config cfg;
|
||||||
scanner_options options;
|
scanner_options options;
|
||||||
|
|
||||||
@ -88,6 +91,7 @@ void basic_end_to_end_test(std::string const& compressor,
|
|||||||
cfg.block_size_bits = block_size_bits;
|
cfg.block_size_bits = block_size_bits;
|
||||||
|
|
||||||
options.file_order.mode = file_order;
|
options.file_order.mode = file_order;
|
||||||
|
options.file_hash_algorithm = file_hash_algo;
|
||||||
options.with_devices = with_devices;
|
options.with_devices = with_devices;
|
||||||
options.with_specials = with_specials;
|
options.with_specials = with_specials;
|
||||||
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
||||||
@ -397,8 +401,10 @@ class compression_test
|
|||||||
std::tuple<std::string, unsigned, file_order_mode>> {};
|
std::tuple<std::string, unsigned, file_order_mode>> {};
|
||||||
|
|
||||||
class scanner_test : public testing::TestWithParam<
|
class scanner_test : public testing::TestWithParam<
|
||||||
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
|
std::tuple<bool, bool, bool, bool, bool, bool, bool,
|
||||||
};
|
std::optional<std::string>>> {};
|
||||||
|
|
||||||
|
class hashing_test : public testing::TestWithParam<std::string> {};
|
||||||
|
|
||||||
class packing_test : public testing::TestWithParam<
|
class packing_test : public testing::TestWithParam<
|
||||||
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
|
std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
|
||||||
@ -417,17 +423,24 @@ TEST_P(compression_test, end_to_end) {
|
|||||||
|
|
||||||
basic_end_to_end_test(compressor, block_size_bits, file_order, true, true,
|
basic_end_to_end_test(compressor, block_size_bits, file_order, true, true,
|
||||||
false, false, false, false, false, true, true, true,
|
false, false, false, false, false, true, true, true,
|
||||||
true, true, true, true, false, false);
|
true, true, true, true, false, false,
|
||||||
|
default_file_hash_algo);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(scanner_test, end_to_end) {
|
TEST_P(scanner_test, end_to_end) {
|
||||||
auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times,
|
auto [with_devices, with_specials, set_uid, set_gid, set_time, keep_all_times,
|
||||||
enable_nlink] = GetParam();
|
enable_nlink, file_hash_algo] = GetParam();
|
||||||
|
|
||||||
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE,
|
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE,
|
||||||
with_devices, with_specials, set_uid, set_gid, set_time,
|
with_devices, with_specials, set_uid, set_gid, set_time,
|
||||||
keep_all_times, enable_nlink, true, true, true, true,
|
keep_all_times, enable_nlink, true, true, true, true,
|
||||||
true, true, true, false, false);
|
true, true, true, false, false, file_hash_algo);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_P(hashing_test, end_to_end) {
|
||||||
|
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
|
||||||
|
true, true, true, true, true, true, true, true, true,
|
||||||
|
true, true, true, false, false, GetParam());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(packing_test, end_to_end) {
|
TEST_P(packing_test, end_to_end) {
|
||||||
@ -438,7 +451,7 @@ TEST_P(packing_test, end_to_end) {
|
|||||||
false, false, false, false, false, pack_chunk_table,
|
false, false, false, false, false, pack_chunk_table,
|
||||||
pack_directories, pack_shared_files_table, pack_names,
|
pack_directories, pack_shared_files_table, pack_names,
|
||||||
pack_names_index, pack_symlinks, pack_symlinks_index,
|
pack_names_index, pack_symlinks, pack_symlinks_index,
|
||||||
false, false);
|
false, false, default_file_hash_algo);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(plain_tables_test, end_to_end) {
|
TEST_P(plain_tables_test, end_to_end) {
|
||||||
@ -447,7 +460,7 @@ TEST_P(plain_tables_test, end_to_end) {
|
|||||||
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
|
basic_end_to_end_test(compressions[0], 15, file_order_mode::NONE, true, true,
|
||||||
false, false, false, false, false, false, false, false,
|
false, false, false, false, false, false, false, false,
|
||||||
false, false, false, false, plain_names_table,
|
false, false, false, false, plain_names_table,
|
||||||
plain_symlinks_table);
|
plain_symlinks_table, default_file_hash_algo);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(packing_test, regression_empty_fs) {
|
TEST_P(packing_test, regression_empty_fs) {
|
||||||
@ -516,7 +529,11 @@ INSTANTIATE_TEST_SUITE_P(
|
|||||||
dwarfs, scanner_test,
|
dwarfs, scanner_test,
|
||||||
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||||
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
|
||||||
::testing::Bool()));
|
::testing::Bool(),
|
||||||
|
::testing::Values(std::nullopt, "xxh3-128", "sha512")));
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(dwarfs, hashing_test,
|
||||||
|
::testing::ValuesIn(checksum::available_algorithms()));
|
||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
dwarfs, packing_test,
|
dwarfs, packing_test,
|
||||||
@ -661,7 +678,12 @@ TEST_P(compression_regression, github45) {
|
|||||||
INSTANTIATE_TEST_SUITE_P(dwarfs, compression_regression,
|
INSTANTIATE_TEST_SUITE_P(dwarfs, compression_regression,
|
||||||
::testing::ValuesIn(compressions));
|
::testing::ValuesIn(compressions));
|
||||||
|
|
||||||
TEST(scanner, inode_ordering) {
|
class file_scanner : public testing::TestWithParam<std::optional<std::string>> {
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST_P(file_scanner, inode_ordering) {
|
||||||
|
auto file_hash_algo = GetParam();
|
||||||
|
|
||||||
std::ostringstream logss;
|
std::ostringstream logss;
|
||||||
stream_logger lgr(logss); // TODO: mock
|
stream_logger lgr(logss); // TODO: mock
|
||||||
lgr.set_policy<prod_logger_policy>();
|
lgr.set_policy<prod_logger_policy>();
|
||||||
@ -670,9 +692,10 @@ TEST(scanner, inode_ordering) {
|
|||||||
auto opts = scanner_options();
|
auto opts = scanner_options();
|
||||||
|
|
||||||
opts.file_order.mode = file_order_mode::PATH;
|
opts.file_order.mode = file_order_mode::PATH;
|
||||||
|
opts.file_hash_algorithm = file_hash_algo;
|
||||||
|
|
||||||
auto input = std::make_shared<test::os_access_mock>();
|
auto input = std::make_shared<test::os_access_mock>();
|
||||||
constexpr int dim = 15;
|
constexpr int dim = 14;
|
||||||
|
|
||||||
input->add_dir("");
|
input->add_dir("");
|
||||||
|
|
||||||
@ -693,3 +716,6 @@ TEST(scanner, inode_ordering) {
|
|||||||
EXPECT_EQ(ref, build_dwarfs(lgr, input, "null", bmcfg, opts));
|
EXPECT_EQ(ref, build_dwarfs(lgr, input, "null", bmcfg, opts));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(dwarfs, file_scanner,
|
||||||
|
::testing::Values(std::nullopt, "xxh3-128"));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user