mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-10 13:04:15 -04:00
Add support for nilsimsa-based inode ordering
Also refactor inode manager to take full control of ordering. Nilsimsa ordering is really slow, but has the advantage that inodes are added continuously while ordering. This allows us to perform the bulk of the ordering in the background while already running segmenting/compression.
This commit is contained in:
parent
4f22043279
commit
63c9e9a3c3
@ -109,9 +109,8 @@ class entry : public entry_interface {
|
|||||||
class file : public entry {
|
class file : public entry {
|
||||||
public:
|
public:
|
||||||
file(const std::string& name, std::shared_ptr<entry> parent,
|
file(const std::string& name, std::shared_ptr<entry> parent,
|
||||||
const struct ::stat& st, bool with_similarity)
|
const struct ::stat& st)
|
||||||
: entry(name, parent, st)
|
: entry(name, parent, st) {}
|
||||||
, with_similarity_(with_similarity) {}
|
|
||||||
|
|
||||||
type_t type() const override;
|
type_t type() const override;
|
||||||
std::string_view hash() const;
|
std::string_view hash() const;
|
||||||
@ -119,14 +118,11 @@ class file : public entry {
|
|||||||
std::shared_ptr<inode> get_inode() const;
|
std::shared_ptr<inode> get_inode() const;
|
||||||
void accept(entry_visitor& v, bool preorder) override;
|
void accept(entry_visitor& v, bool preorder) override;
|
||||||
uint32_t inode_num() const override;
|
uint32_t inode_num() const override;
|
||||||
uint32_t similarity_hash() const { return similarity_hash_; }
|
|
||||||
void scan(os_access& os, progress& prog) override;
|
void scan(os_access& os, progress& prog) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
using hash_type = std::array<char, 20>;
|
using hash_type = std::array<char, 20>;
|
||||||
|
|
||||||
uint32_t similarity_hash_{0};
|
|
||||||
const bool with_similarity_;
|
|
||||||
hash_type hash_{0};
|
hash_type hash_{0};
|
||||||
std::shared_ptr<inode> inode_;
|
std::shared_ptr<inode> inode_;
|
||||||
};
|
};
|
||||||
@ -195,7 +191,7 @@ class device : public entry {
|
|||||||
|
|
||||||
class entry_factory {
|
class entry_factory {
|
||||||
public:
|
public:
|
||||||
static std::unique_ptr<entry_factory> create(bool with_similarity = false);
|
static std::unique_ptr<entry_factory> create();
|
||||||
|
|
||||||
virtual ~entry_factory() = default;
|
virtual ~entry_factory() = default;
|
||||||
|
|
||||||
|
@ -35,15 +35,20 @@ struct chunk;
|
|||||||
}
|
}
|
||||||
|
|
||||||
class file;
|
class file;
|
||||||
|
class os_access;
|
||||||
|
|
||||||
|
struct inode_options;
|
||||||
|
|
||||||
class inode : public object {
|
class inode : public object {
|
||||||
public:
|
public:
|
||||||
using files_vector = folly::small_vector<file*, 1>;
|
using files_vector = folly::small_vector<file*, 1>;
|
||||||
|
|
||||||
virtual void set_files(files_vector&& fv) = 0;
|
virtual void set_files(files_vector&& fv) = 0;
|
||||||
|
virtual void scan(os_access& os, inode_options const& options) = 0;
|
||||||
virtual void set_num(uint32_t num) = 0;
|
virtual void set_num(uint32_t num) = 0;
|
||||||
virtual uint32_t num() const = 0;
|
virtual uint32_t num() const = 0;
|
||||||
virtual uint32_t similarity_hash() const = 0;
|
virtual uint32_t similarity_hash() const = 0;
|
||||||
|
virtual std::vector<uint64_t> const& nilsimsa_similarity_hash() const = 0;
|
||||||
virtual size_t size() const = 0;
|
virtual size_t size() const = 0;
|
||||||
virtual file const* any() const = 0;
|
virtual file const* any() const = 0;
|
||||||
virtual files_vector const& files() const = 0;
|
virtual files_vector const& files() const = 0;
|
||||||
|
@ -25,23 +25,45 @@
|
|||||||
#include <functional>
|
#include <functional>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
|
#include "dwarfs/options.h"
|
||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
class inode;
|
class inode;
|
||||||
|
class logger;
|
||||||
class script;
|
class script;
|
||||||
|
|
||||||
class inode_manager {
|
class inode_manager {
|
||||||
public:
|
public:
|
||||||
static std::unique_ptr<inode_manager> create();
|
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
|
||||||
|
|
||||||
|
inode_manager(logger& lgr);
|
||||||
|
|
||||||
|
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }
|
||||||
|
|
||||||
|
size_t count() const { return impl_->count(); }
|
||||||
|
|
||||||
|
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
||||||
|
uint32_t first_inode, inode_cb const& fn) {
|
||||||
|
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
|
||||||
|
}
|
||||||
|
|
||||||
|
void for_each_inode(inode_cb const& fn) const { impl_->for_each_inode(fn); }
|
||||||
|
|
||||||
|
class impl {
|
||||||
|
public:
|
||||||
|
virtual ~impl() = default;
|
||||||
|
|
||||||
virtual ~inode_manager() = default;
|
|
||||||
virtual std::shared_ptr<inode> create_inode() = 0;
|
virtual std::shared_ptr<inode> create_inode() = 0;
|
||||||
virtual size_t count() const = 0;
|
virtual size_t count() const = 0;
|
||||||
virtual void order_inodes() = 0;
|
virtual void
|
||||||
virtual void order_inodes(std::shared_ptr<script> scr) = 0;
|
order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
||||||
virtual void order_inodes_by_similarity() = 0;
|
uint32_t first_inode, inode_cb const& fn) = 0;
|
||||||
virtual void number_inodes(size_t first_no) = 0;
|
|
||||||
virtual void for_each_inode(
|
virtual void for_each_inode(
|
||||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<impl> impl_;
|
||||||
};
|
};
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -46,7 +46,14 @@ struct filesystem_options {
|
|||||||
metadata_options metadata;
|
metadata_options metadata;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY };
|
// Per-inode scanning options: selects which similarity hashes are computed
// for an inode.
struct inode_options {
  // Compute the 32-bit similarity hash.
  bool with_similarity = false;
  // Compute the nilsimsa similarity hash.
  bool with_nilsimsa = false;

  // Returns true if at least one of the hashes above is requested.
  bool needs_scan() const {
    if (with_similarity) {
      return true;
    }
    return with_nilsimsa;
  }
};
|
||||||
|
|
||||||
|
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA };
|
||||||
|
|
||||||
struct scanner_options {
|
struct scanner_options {
|
||||||
file_order_mode file_order{file_order_mode::NONE};
|
file_order_mode file_order{file_order_mode::NONE};
|
||||||
@ -54,6 +61,7 @@ struct scanner_options {
|
|||||||
std::optional<uint16_t> gid;
|
std::optional<uint16_t> gid;
|
||||||
std::optional<uint64_t> timestamp;
|
std::optional<uint64_t> timestamp;
|
||||||
bool remove_empty_dirs{false};
|
bool remove_empty_dirs{false};
|
||||||
|
inode_options inode;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, file_order_mode mode);
|
std::ostream& operator<<(std::ostream& os, file_order_mode mode);
|
||||||
|
@ -32,9 +32,10 @@
|
|||||||
#include "dwarfs/global_entry_data.h"
|
#include "dwarfs/global_entry_data.h"
|
||||||
#include "dwarfs/inode.h"
|
#include "dwarfs/inode.h"
|
||||||
#include "dwarfs/mmif.h"
|
#include "dwarfs/mmif.h"
|
||||||
|
#include "dwarfs/nilsimsa.h"
|
||||||
|
#include "dwarfs/options.h"
|
||||||
#include "dwarfs/os_access.h"
|
#include "dwarfs/os_access.h"
|
||||||
#include "dwarfs/progress.h"
|
#include "dwarfs/progress.h"
|
||||||
#include "dwarfs/similarity.h"
|
|
||||||
|
|
||||||
#include "dwarfs/gen-cpp2/metadata_types.h"
|
#include "dwarfs/gen-cpp2/metadata_types.h"
|
||||||
|
|
||||||
@ -168,10 +169,6 @@ void file::scan(os_access& os, progress& prog) {
|
|||||||
auto mm = os.map_file(path(), s);
|
auto mm = os.map_file(path(), s);
|
||||||
::SHA1(mm->as<unsigned char>(), s,
|
::SHA1(mm->as<unsigned char>(), s,
|
||||||
reinterpret_cast<unsigned char*>(&hash_[0]));
|
reinterpret_cast<unsigned char*>(&hash_[0]));
|
||||||
|
|
||||||
if (with_similarity_) {
|
|
||||||
similarity_hash_ = get_similarity_hash(mm->as<uint8_t>(), s);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
::SHA1(nullptr, 0, reinterpret_cast<unsigned char*>(&hash_[0]));
|
::SHA1(nullptr, 0, reinterpret_cast<unsigned char*>(&hash_[0]));
|
||||||
}
|
}
|
||||||
@ -290,9 +287,6 @@ uint64_t device::device_id() const { return status().st_rdev; }
|
|||||||
|
|
||||||
class entry_factory_ : public entry_factory {
|
class entry_factory_ : public entry_factory {
|
||||||
public:
|
public:
|
||||||
entry_factory_(bool with_similarity)
|
|
||||||
: with_similarity_(with_similarity) {}
|
|
||||||
|
|
||||||
std::shared_ptr<entry> create(os_access& os, const std::string& name,
|
std::shared_ptr<entry> create(os_access& os, const std::string& name,
|
||||||
std::shared_ptr<entry> parent) override {
|
std::shared_ptr<entry> parent) override {
|
||||||
const std::string& p = parent ? parent->path() + "/" + name : name;
|
const std::string& p = parent ? parent->path() + "/" + name : name;
|
||||||
@ -302,8 +296,7 @@ class entry_factory_ : public entry_factory {
|
|||||||
auto mode = st.st_mode;
|
auto mode = st.st_mode;
|
||||||
|
|
||||||
if (S_ISREG(mode)) {
|
if (S_ISREG(mode)) {
|
||||||
return std::make_shared<file>(name, std::move(parent), st,
|
return std::make_shared<file>(name, std::move(parent), st);
|
||||||
with_similarity_);
|
|
||||||
} else if (S_ISDIR(mode)) {
|
} else if (S_ISDIR(mode)) {
|
||||||
return std::make_shared<dir>(name, std::move(parent), st);
|
return std::make_shared<dir>(name, std::move(parent), st);
|
||||||
} else if (S_ISLNK(mode)) {
|
} else if (S_ISLNK(mode)) {
|
||||||
@ -317,12 +310,9 @@ class entry_factory_ : public entry_factory {
|
|||||||
|
|
||||||
return std::shared_ptr<entry>();
|
return std::shared_ptr<entry>();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
const bool with_similarity_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<entry_factory> entry_factory::create(bool with_similarity) {
|
std::unique_ptr<entry_factory> entry_factory::create() {
|
||||||
return std::make_unique<entry_factory_>(with_similarity);
|
return std::make_unique<entry_factory_>();
|
||||||
}
|
}
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -20,7 +20,9 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cassert>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <deque>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
@ -30,34 +32,69 @@
|
|||||||
#include "dwarfs/entry.h"
|
#include "dwarfs/entry.h"
|
||||||
#include "dwarfs/inode.h"
|
#include "dwarfs/inode.h"
|
||||||
#include "dwarfs/inode_manager.h"
|
#include "dwarfs/inode_manager.h"
|
||||||
|
#include "dwarfs/logger.h"
|
||||||
|
#include "dwarfs/mmif.h"
|
||||||
|
#include "dwarfs/nilsimsa.h"
|
||||||
|
#include "dwarfs/os_access.h"
|
||||||
#include "dwarfs/script.h"
|
#include "dwarfs/script.h"
|
||||||
|
#include "dwarfs/similarity.h"
|
||||||
|
|
||||||
#include "dwarfs/gen-cpp2/metadata_types.h"
|
#include "dwarfs/gen-cpp2/metadata_types.h"
|
||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
class inode_manager_ : public inode_manager {
|
namespace {
|
||||||
private:
|
|
||||||
class inode_ : public inode {
|
class inode_ : public inode {
|
||||||
public:
|
public:
|
||||||
using chunk_type = thrift::metadata::chunk;
|
using chunk_type = thrift::metadata::chunk;
|
||||||
|
|
||||||
void set_num(uint32_t num) override { num_ = num; }
|
void set_num(uint32_t num) override { num_ = num; }
|
||||||
|
|
||||||
uint32_t num() const override { return num_; }
|
uint32_t num() const override { return num_; }
|
||||||
|
|
||||||
uint32_t similarity_hash() const override {
|
uint32_t similarity_hash() const override {
|
||||||
if (files_.empty()) {
|
if (files_.empty()) {
|
||||||
throw std::runtime_error("inode has no file");
|
throw std::runtime_error("inode has no file");
|
||||||
}
|
}
|
||||||
return files_.front()->similarity_hash();
|
return similarity_hash_;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<uint64_t> const& nilsimsa_similarity_hash() const override {
|
||||||
|
if (files_.empty()) {
|
||||||
|
throw std::runtime_error("inode has no file");
|
||||||
|
}
|
||||||
|
return nilsimsa_similarity_hash_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_files(files_vector&& fv) override {
|
void set_files(files_vector&& fv) override {
|
||||||
if (!files_.empty()) {
|
if (!files_.empty()) {
|
||||||
throw std::runtime_error("files already set for inode");
|
throw std::runtime_error("files already set for inode");
|
||||||
}
|
}
|
||||||
|
|
||||||
files_ = std::move(fv);
|
files_ = std::move(fv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Computes the similarity hashes requested by `opts` for this inode.
//
// Only the first file of the inode is read. It is mapped through
// `os.map_file()`, and the mapped bytes feed `get_similarity_hash()` and/or
// `nilsimsa_compute_hash()`. If no hash is requested, or the file is empty,
// the stored hashes keep their default values.
//
// @param os    used to map the file contents
// @param opts  selects which hashes to compute
// @throws std::runtime_error if a scan is requested before any file has
//         been attached via set_files(); this matches the hash accessors
//         and avoids calling front() on an empty container
void scan(os_access& os, inode_options const& opts) override {
  if (!opts.needs_scan()) {
    return;
  }

  if (files_.empty()) {
    throw std::runtime_error("inode has no file");
  }

  auto file = files_.front();
  auto size = file->size();

  if (size == 0) {
    // nothing to hash for an empty file
    return;
  }

  auto mm = os.map_file(file->path(), size);
  auto data = mm->as<uint8_t>();

  if (opts.with_similarity) {
    similarity_hash_ = get_similarity_hash(data, size);
  }

  if (opts.with_nilsimsa) {
    nilsimsa_similarity_hash_ = nilsimsa_compute_hash(data, size);
  }
}
|
||||||
|
|
||||||
void add_chunk(size_t block, size_t offset, size_t size) override {
|
void add_chunk(size_t block, size_t offset, size_t size) override {
|
||||||
chunk_type c;
|
chunk_type c;
|
||||||
c.block = block;
|
c.block = block;
|
||||||
@ -83,11 +120,37 @@ class inode_manager_ : public inode_manager {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t num_{std::numeric_limits<uint32_t>::max()};
|
uint32_t num_{std::numeric_limits<uint32_t>::max()};
|
||||||
|
uint32_t similarity_hash_{0};
|
||||||
files_vector files_;
|
files_vector files_;
|
||||||
std::vector<chunk_type> chunks_;
|
std::vector<chunk_type> chunks_;
|
||||||
};
|
std::vector<uint64_t> nilsimsa_similarity_hash_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class nilsimsa_cache_entry {
|
||||||
public:
|
public:
|
||||||
|
nilsimsa_cache_entry(std::shared_ptr<inode> i)
|
||||||
|
: size(i->size())
|
||||||
|
, hash(i->nilsimsa_similarity_hash().data())
|
||||||
|
, path(i->any()->path())
|
||||||
|
, ino(std::move(i)) {
|
||||||
|
assert(hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
int similarity{0};
|
||||||
|
uint64_t const size;
|
||||||
|
uint64_t const* const hash;
|
||||||
|
std::string const path;
|
||||||
|
std::shared_ptr<inode> ino;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
class inode_manager_ : public inode_manager::impl {
|
||||||
|
public:
|
||||||
|
inode_manager_(logger& lgr)
|
||||||
|
: log_(lgr) {}
|
||||||
|
|
||||||
std::shared_ptr<inode> create_inode() override {
|
std::shared_ptr<inode> create_inode() override {
|
||||||
auto ino = std::make_shared<inode_>();
|
auto ino = std::make_shared<inode_>();
|
||||||
inodes_.push_back(ino);
|
inodes_.push_back(ino);
|
||||||
@ -96,11 +159,59 @@ class inode_manager_ : public inode_manager {
|
|||||||
|
|
||||||
size_t count() const override { return inodes_.size(); }
|
size_t count() const override { return inodes_.size(); }
|
||||||
|
|
||||||
void order_inodes(std::shared_ptr<script> scr) override {
|
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
||||||
scr->order(inodes_);
|
uint32_t first_inode,
|
||||||
|
inode_manager::inode_cb const& fn) override {
|
||||||
|
switch (file_order) {
|
||||||
|
case file_order_mode::NONE:
|
||||||
|
log_.info() << "keeping inode order";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case file_order_mode::PATH: {
|
||||||
|
log_.info() << "ordering " << count() << " inodes by path name...";
|
||||||
|
auto ti = log_.timed_info();
|
||||||
|
order_inodes_by_path();
|
||||||
|
ti << count() << " inodes ordered";
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
void order_inodes() override {
|
case file_order_mode::SCRIPT: {
|
||||||
|
if (!scr->has_order()) {
|
||||||
|
throw std::runtime_error("script cannot order inodes");
|
||||||
|
}
|
||||||
|
log_.info() << "ordering " << count() << " inodes using script...";
|
||||||
|
auto ti = log_.timed_info();
|
||||||
|
scr->order(inodes_);
|
||||||
|
ti << count() << " inodes ordered";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case file_order_mode::SIMILARITY: {
|
||||||
|
log_.info() << "ordering " << count() << " inodes by similarity...";
|
||||||
|
auto ti = log_.timed_info();
|
||||||
|
order_inodes_by_similarity();
|
||||||
|
ti << count() << " inodes ordered";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case file_order_mode::NILSIMSA: {
|
||||||
|
log_.info() << "ordering " << count()
|
||||||
|
<< " inodes using nilsimsa similarity...";
|
||||||
|
auto ti = log_.timed_info();
|
||||||
|
order_inodes_by_nilsimsa(fn, first_inode);
|
||||||
|
ti << count() << " inodes ordered";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (file_order != file_order_mode::NILSIMSA) {
|
||||||
|
log_.info() << "assigning file inodes...";
|
||||||
|
number_inodes(first_inode);
|
||||||
|
for_each_inode(fn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void order_inodes_by_path() {
|
||||||
std::vector<std::string> paths;
|
std::vector<std::string> paths;
|
||||||
std::vector<size_t> index(inodes_.size());
|
std::vector<size_t> index(inodes_.size());
|
||||||
|
|
||||||
@ -125,7 +236,7 @@ class inode_manager_ : public inode_manager {
|
|||||||
inodes_.swap(tmp);
|
inodes_.swap(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
void order_inodes_by_similarity() override {
|
void order_inodes_by_similarity() {
|
||||||
std::sort(
|
std::sort(
|
||||||
inodes_.begin(), inodes_.end(),
|
inodes_.begin(), inodes_.end(),
|
||||||
[](const std::shared_ptr<inode>& a, const std::shared_ptr<inode>& b) {
|
[](const std::shared_ptr<inode>& a, const std::shared_ptr<inode>& b) {
|
||||||
@ -138,7 +249,131 @@ class inode_manager_ : public inode_manager {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void number_inodes(size_t first_no) override {
|
// Orders the inodes by nilsimsa similarity. Each inode is numbered and
// passed to `fn` as soon as its position in the order is decided.
//
// Steps:
//   1. Empty inodes are moved to the front and finalized first.
//   2. The largest non-empty inode (ties broken by path) is finalized next
//      and becomes the first reference inode.
//   3. The remaining inodes are moved into a cache. The candidates most
//      similar to the most recently finalized inode are appended to
//      `inodes_` and finalized, and this repeats until none are left.
//
// @param fn        callback invoked for every inode right after it has
//                  been numbered
// @param inode_no  number assigned to the first finalized inode; later
//                  inodes receive consecutive numbers
// @throws std::runtime_error if the number of inodes after ordering
//         differs from the number before
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
                              uint32_t inode_no) {
  // assign the next inode number and hand the inode to the callback
  auto finalize_inode = [&](auto& ino) {
    ino->set_num(inode_no++);
    fn(ino);
  };

  auto count = inodes_.size();

  // skip all empty inodes (this is at most one)
  auto beg = std::partition(inodes_.begin(), inodes_.end(),
                            [](auto const& p) { return p->size() == 0; });

  for (auto it = inodes_.begin(); it != beg; ++it) {
    finalize_inode(*it);
  }

  // No non-empty inode (no inodes at all, or only empty ones): there is
  // nothing left to order, and dereferencing `beg` would be undefined
  // behaviour.
  if (beg == inodes_.end()) {
    return;
  }

  // find the largest inode
  std::nth_element(beg, beg, inodes_.end(), [](auto const& a, auto const& b) {
    return (a->size() > b->size() ||
            (a->size() == b->size() && a->any()->path() < b->any()->path()));
  });

  finalize_inode(*beg);

  // build a cache for the remaining inodes
  std::vector<nilsimsa_cache_entry> cache;
  std::deque<uint32_t> index;
  index.resize(std::distance(beg + 1, inodes_.end()));
  std::iota(index.begin(), index.end(), 0);
  cache.reserve(index.size());

  for (auto it = beg + 1; it != inodes_.end(); ++it) {
    cache.emplace_back(std::move(*it));
  }

  assert(index.size() == cache.size());

  // and temporarily remove from the original array
  inodes_.erase(beg + 1, inodes_.end());

  // Candidate order: higher similarity first, then larger size, then
  // lexicographically smaller path.
  auto cmp = [&cache](uint32_t a, uint32_t b) {
    auto& da = cache[a];
    auto& db = cache[b];
    return da.similarity > db.similarity ||
           (da.similarity == db.similarity &&
            (da.size > db.size ||
             (da.size == db.size && da.path < db.path)));
  };

  // Largest similarity drop (best candidate vs. last candidate) tolerated
  // inside the sorted window.
  const int sim_thresh_depth = 16;
  // Largest similarity drop (relative to the best candidate) for inodes
  // emitted in one run.
  const int sim_thresh = 0;
  // Above this many candidates only a window at the front is sorted.
  const size_t max_depth = 2000;
  // The sorted window grows in steps of this size.
  const size_t depth_step = 500;

  // Emits the leading run of candidates whose similarity is within
  // `sim_thresh` of the best candidate, while the window is still larger
  // than `depth_thresh`. Callers guarantee `index` is non-empty.
  auto emit_most_similar = [&](size_t& depth, size_t depth_thresh) {
    auto sim = cache[index.front()].similarity;

    while (!index.empty() && depth > depth_thresh &&
           sim - cache[index.front()].similarity <= sim_thresh) {
      inodes_.push_back(std::move(cache[index.front()].ino));
      finalize_inode(inodes_.back());
      index.pop_front();
      --depth;
    }
  };

  while (!index.empty()) {
    // compare reference inode with all remaining inodes
    auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
    for (auto& d : cache) {
      d.similarity = dwarfs::nilsimsa_similarity(ref_hash, d.hash);
    }

    size_t depth = 0;
    size_t depth_thresh;

    if (index.size() > max_depth) {
      // Too many candidates for a full sort: grow a sorted window step by
      // step and stop early once similarity falls off too much inside it.
      while (depth < max_depth && depth + depth_step < index.size()) {
        std::partial_sort(index.begin() + depth,
                          index.begin() + depth + depth_step, index.end(),
                          cmp);
        depth += depth_step;
        if (cache[index[0]].similarity - cache[index[depth - 1]].similarity >
            sim_thresh_depth) {
          // shrink the window back to the last candidate within threshold
          do {
            --depth;
          } while (cache[index[0]].similarity -
                       cache[index[depth - 1]].similarity >
                   sim_thresh_depth);
          break;
        }
      }
      depth_thresh = depth / 2;
    } else {
      std::sort(index.begin(), index.end(), cmp);
      depth = index.size();
      depth_thresh = 0;
    }

    emit_most_similar(depth, depth_thresh);

    // Keep consuming the window: re-rank it against the inode emitted last
    // until it has shrunk to `depth_thresh`.
    while (depth > depth_thresh) {
      ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
      for (size_t i = 0; i < depth; ++i) {
        cache[index[i]].similarity =
            dwarfs::nilsimsa_similarity(ref_hash, cache[index[i]].hash);
      }

      std::partial_sort(index.begin(), index.begin() + (depth - depth_thresh),
                        index.begin() + depth, cmp);

      emit_most_similar(depth, depth_thresh);
    }
  }

  if (count != inodes_.size()) {
    throw std::runtime_error("internal error: nilsimsa ordering failed");
  }
}
|
||||||
|
|
||||||
|
void number_inodes(size_t first_no) {
|
||||||
for (auto& i : inodes_) {
|
for (auto& i : inodes_) {
|
||||||
i->set_num(first_no++);
|
i->set_num(first_no++);
|
||||||
}
|
}
|
||||||
@ -154,9 +389,11 @@ class inode_manager_ : public inode_manager {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<std::shared_ptr<inode>> inodes_;
|
std::vector<std::shared_ptr<inode>> inodes_;
|
||||||
|
log_proxy<LoggerPolicy> log_;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<inode_manager> inode_manager::create() {
|
inode_manager::inode_manager(logger& lgr)
|
||||||
return std::make_unique<inode_manager_>();
|
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
|
||||||
}
|
lgr)) {}
|
||||||
|
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -44,6 +44,9 @@ std::ostream& operator<<(std::ostream& os, file_order_mode mode) {
|
|||||||
case file_order_mode::SIMILARITY:
|
case file_order_mode::SIMILARITY:
|
||||||
modestr = "similarity";
|
modestr = "similarity";
|
||||||
break;
|
break;
|
||||||
|
case file_order_mode::NILSIMSA:
|
||||||
|
modestr = "nilsimsa";
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -37,6 +37,8 @@
|
|||||||
|
|
||||||
#include <folly/ExceptionString.h>
|
#include <folly/ExceptionString.h>
|
||||||
|
|
||||||
|
#include <fmt/format.h>
|
||||||
|
|
||||||
#include "dwarfs/entry.h"
|
#include "dwarfs/entry.h"
|
||||||
#include "dwarfs/filesystem_writer.h"
|
#include "dwarfs/filesystem_writer.h"
|
||||||
#include "dwarfs/global_entry_data.h"
|
#include "dwarfs/global_entry_data.h"
|
||||||
@ -93,7 +95,8 @@ class file_deduplication_visitor : public visitor_base {
|
|||||||
public:
|
public:
|
||||||
void visit(file* p) override { hash_[p->hash()].push_back(p); }
|
void visit(file* p) override { hash_[p->hash()].push_back(p); }
|
||||||
|
|
||||||
void deduplicate_files(inode_manager& im, progress& prog) {
|
void deduplicate_files(worker_group& wg, os_access& os, inode_manager& im,
|
||||||
|
inode_options const& ino_opts, progress& prog) {
|
||||||
for (auto& p : hash_) {
|
for (auto& p : hash_) {
|
||||||
auto& files = p.second;
|
auto& files = p.second;
|
||||||
|
|
||||||
@ -115,6 +118,10 @@ class file_deduplication_visitor : public visitor_base {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inode->set_files(std::move(files));
|
inode->set_files(std::move(files));
|
||||||
|
|
||||||
|
if (ino_opts.needs_scan()) {
|
||||||
|
wg.add_job([&, inode] { inode->scan(os, ino_opts); });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -278,7 +285,6 @@ class scanner_ : public scanner::impl {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<entry> scan_tree(const std::string& path, progress& prog);
|
std::shared_ptr<entry> scan_tree(const std::string& path, progress& prog);
|
||||||
void order_files(inode_manager& im);
|
|
||||||
|
|
||||||
const block_manager::config& cfg_;
|
const block_manager::config& cfg_;
|
||||||
const scanner_options& options_;
|
const scanner_options& options_;
|
||||||
@ -409,42 +415,6 @@ scanner_<LoggerPolicy>::scan_tree(const std::string& path, progress& prog) {
|
|||||||
return root;
|
return root;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename LoggerPolicy>
|
|
||||||
void scanner_<LoggerPolicy>::order_files(inode_manager& im) {
|
|
||||||
switch (options_.file_order) {
|
|
||||||
case file_order_mode::NONE:
|
|
||||||
log_.info() << "keeping inode order";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case file_order_mode::PATH: {
|
|
||||||
log_.info() << "ordering " << im.count() << " inodes by path name...";
|
|
||||||
auto ti = log_.timed_info();
|
|
||||||
im.order_inodes();
|
|
||||||
ti << im.count() << " inodes ordered";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case file_order_mode::SCRIPT: {
|
|
||||||
if (!script_->has_order()) {
|
|
||||||
throw std::runtime_error("script cannot order inodes");
|
|
||||||
}
|
|
||||||
log_.info() << "ordering " << im.count() << " inodes using script...";
|
|
||||||
auto ti = log_.timed_info();
|
|
||||||
im.order_inodes(script_);
|
|
||||||
ti << im.count() << " inodes ordered";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case file_order_mode::SIMILARITY: {
|
|
||||||
log_.info() << "ordering " << im.count() << " inodes by similarity...";
|
|
||||||
auto ti = log_.timed_info();
|
|
||||||
im.order_inodes_by_similarity();
|
|
||||||
ti << im.count() << " inodes ordered";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename LoggerPolicy>
|
template <typename LoggerPolicy>
|
||||||
void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||||
const std::string& path, progress& prog) {
|
const std::string& path, progress& prog) {
|
||||||
@ -479,22 +449,22 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
|||||||
|
|
||||||
log_.info() << "finding duplicate files...";
|
log_.info() << "finding duplicate files...";
|
||||||
|
|
||||||
auto im = inode_manager::create();
|
inode_manager im(lgr_);
|
||||||
|
|
||||||
file_deduplication_visitor fdv;
|
file_deduplication_visitor fdv;
|
||||||
root->accept(fdv);
|
root->accept(fdv);
|
||||||
|
|
||||||
fdv.deduplicate_files(*im, prog);
|
fdv.deduplicate_files(wg_, *os_, im, options_.inode, prog);
|
||||||
|
|
||||||
log_.info() << "saved " << size_with_unit(prog.saved_by_deduplication)
|
log_.info() << "saved " << size_with_unit(prog.saved_by_deduplication)
|
||||||
<< " / " << size_with_unit(prog.original_size) << " in "
|
<< " / " << size_with_unit(prog.original_size) << " in "
|
||||||
<< prog.duplicate_files << "/" << prog.files_found
|
<< prog.duplicate_files << "/" << prog.files_found
|
||||||
<< " duplicate files";
|
<< " duplicate files";
|
||||||
|
|
||||||
order_files(*im);
|
if (options_.inode.needs_scan()) {
|
||||||
|
log_.info() << "waiting for inode scanners...";
|
||||||
log_.info() << "assigning file inodes...";
|
wg_.wait();
|
||||||
im->number_inodes(first_file_inode);
|
}
|
||||||
|
|
||||||
global_entry_data ge_data(options_);
|
global_entry_data ge_data(options_);
|
||||||
thrift::metadata::metadata mv2;
|
thrift::metadata::metadata mv2;
|
||||||
@ -502,7 +472,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
|||||||
mv2.link_index.resize(first_file_inode - first_link_inode);
|
mv2.link_index.resize(first_file_inode - first_link_inode);
|
||||||
|
|
||||||
log_.info() << "assigning device inodes...";
|
log_.info() << "assigning device inodes...";
|
||||||
uint32_t first_device_inode = first_file_inode + im->count();
|
uint32_t first_device_inode = first_file_inode + im.count();
|
||||||
device_set_inode_visitor devsiv(first_device_inode);
|
device_set_inode_visitor devsiv(first_device_inode);
|
||||||
root->accept(devsiv);
|
root->accept(devsiv);
|
||||||
mv2.devices_ref() = std::move(devsiv.device_ids());
|
mv2.devices_ref() = std::move(devsiv.device_ids());
|
||||||
@ -534,7 +504,8 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
|||||||
log_.info() << "building blocks...";
|
log_.info() << "building blocks...";
|
||||||
block_manager bm(lgr_, prog, cfg_, os_, fsw);
|
block_manager bm(lgr_, prog, cfg_, os_, fsw);
|
||||||
|
|
||||||
im->for_each_inode([&](std::shared_ptr<inode> const& ino) {
|
im.order_inodes(script_, options_.file_order, first_file_inode,
|
||||||
|
[&](std::shared_ptr<inode> const& ino) {
|
||||||
prog.current.store(ino.get());
|
prog.current.store(ino.get());
|
||||||
bm.add_inode(ino);
|
bm.add_inode(ino);
|
||||||
prog.inodes_written++;
|
prog.inodes_written++;
|
||||||
@ -567,19 +538,19 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
|||||||
root->set_name(std::string());
|
root->set_name(std::string());
|
||||||
|
|
||||||
log_.info() << "saving chunks...";
|
log_.info() << "saving chunks...";
|
||||||
mv2.chunk_index.resize(im->count() + 1);
|
mv2.chunk_index.resize(im.count() + 1);
|
||||||
|
|
||||||
// TODO: we should be able to start this once all blocks have been
|
// TODO: we should be able to start this once all blocks have been
|
||||||
// submitted for compression
|
// submitted for compression
|
||||||
im->for_each_inode([&](std::shared_ptr<inode> const& ino) {
|
im.for_each_inode([&](std::shared_ptr<inode> const& ino) {
|
||||||
mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size();
|
mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size();
|
||||||
ino->append_chunks_to(mv2.chunks);
|
ino->append_chunks_to(mv2.chunks);
|
||||||
});
|
});
|
||||||
|
|
||||||
// insert dummy inode to help determine number of chunks per inode
|
// insert dummy inode to help determine number of chunks per inode
|
||||||
mv2.chunk_index.at(im->count()) = mv2.chunks.size();
|
mv2.chunk_index.at(im.count()) = mv2.chunks.size();
|
||||||
|
|
||||||
log_.debug() << "total number of file inodes: " << im->count();
|
log_.debug() << "total number of file inodes: " << im.count();
|
||||||
log_.debug() << "total number of chunks: " << mv2.chunks.size();
|
log_.debug() << "total number of chunks: " << mv2.chunks.size();
|
||||||
|
|
||||||
log_.info() << "saving directories...";
|
log_.info() << "saving directories...";
|
||||||
|
@ -95,7 +95,8 @@ const std::map<std::string, file_order_mode> order_choices{
|
|||||||
#ifdef DWARFS_HAVE_PYTHON
|
#ifdef DWARFS_HAVE_PYTHON
|
||||||
{"script", file_order_mode::SCRIPT},
|
{"script", file_order_mode::SCRIPT},
|
||||||
#endif
|
#endif
|
||||||
{"similarity", file_order_mode::SIMILARITY}};
|
{"similarity", file_order_mode::SIMILARITY},
|
||||||
|
{"nilsimsa", file_order_mode::NILSIMSA}};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
@ -555,10 +556,12 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
wg_writer.wait();
|
wg_writer.wait();
|
||||||
ti << "filesystem rewritten";
|
ti << "filesystem rewritten";
|
||||||
} else {
|
} else {
|
||||||
scanner s(lgr, wg_scanner, cfg,
|
options.inode.with_similarity =
|
||||||
entry_factory::create(force_similarity ||
|
force_similarity || options.file_order == file_order_mode::SIMILARITY;
|
||||||
options.file_order ==
|
options.inode.with_nilsimsa =
|
||||||
file_order_mode::SIMILARITY),
|
options.file_order == file_order_mode::NILSIMSA;
|
||||||
|
|
||||||
|
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
||||||
std::make_shared<os_access_posix>(), std::move(script), options);
|
std::make_shared<os_access_posix>(), std::move(script), options);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -199,6 +199,8 @@ void basic_end_to_end_test(const std::string& compressor,
|
|||||||
cfg.block_size_bits = block_size_bits;
|
cfg.block_size_bits = block_size_bits;
|
||||||
|
|
||||||
options.file_order = file_order;
|
options.file_order = file_order;
|
||||||
|
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
||||||
|
options.inode.with_nilsimsa = file_order == file_order_mode::NILSIMSA;
|
||||||
|
|
||||||
// force multithreading
|
// force multithreading
|
||||||
worker_group wg("writer", 4);
|
worker_group wg("writer", 4);
|
||||||
@ -207,8 +209,7 @@ void basic_end_to_end_test(const std::string& compressor,
|
|||||||
stream_logger lgr(logss); // TODO: mock
|
stream_logger lgr(logss); // TODO: mock
|
||||||
lgr.set_policy<prod_logger_policy>();
|
lgr.set_policy<prod_logger_policy>();
|
||||||
|
|
||||||
scanner s(lgr, wg, cfg,
|
scanner s(lgr, wg, cfg, entry_factory::create(),
|
||||||
entry_factory::create(file_order == file_order_mode::SIMILARITY),
|
|
||||||
std::make_shared<test::os_access_mock>(),
|
std::make_shared<test::os_access_mock>(),
|
||||||
std::make_shared<test::script_mock>(), options);
|
std::make_shared<test::script_mock>(), options);
|
||||||
|
|
||||||
@ -323,9 +324,8 @@ TEST_P(basic, end_to_end) {
|
|||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
dwarfs, basic,
|
dwarfs, basic,
|
||||||
::testing::Combine(::testing::ValuesIn(compressions),
|
::testing::Combine(
|
||||||
::testing::Values(12, 15, 20, 28),
|
::testing::ValuesIn(compressions), ::testing::Values(12, 15, 20, 28),
|
||||||
::testing::Values(file_order_mode::NONE,
|
::testing::Values(file_order_mode::NONE, file_order_mode::PATH,
|
||||||
file_order_mode::PATH,
|
file_order_mode::SCRIPT, file_order_mode::NILSIMSA,
|
||||||
file_order_mode::SCRIPT,
|
|
||||||
file_order_mode::SIMILARITY)));
|
file_order_mode::SIMILARITY)));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user