mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-09 20:41:04 -04:00
Add nilsimsa2 order mode (will replace nilsimsa after more testing)
This commit is contained in:
parent
f8164c9e3a
commit
14aed67ade
@ -25,14 +25,14 @@
|
|||||||
#include <functional>
|
#include <functional>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "dwarfs/options.h"
|
|
||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
class inode;
|
class inode;
|
||||||
class logger;
|
class logger;
|
||||||
class script;
|
class script;
|
||||||
|
|
||||||
|
struct file_order_options;
|
||||||
|
|
||||||
class inode_manager {
|
class inode_manager {
|
||||||
public:
|
public:
|
||||||
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
|
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
|
||||||
@ -43,8 +43,9 @@ class inode_manager {
|
|||||||
|
|
||||||
size_t count() const { return impl_->count(); }
|
size_t count() const { return impl_->count(); }
|
||||||
|
|
||||||
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
void order_inodes(std::shared_ptr<script> scr,
|
||||||
uint32_t first_inode, inode_cb const& fn) {
|
file_order_options const& file_order, uint32_t first_inode,
|
||||||
|
inode_cb const& fn) {
|
||||||
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
|
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -56,9 +57,9 @@ class inode_manager {
|
|||||||
|
|
||||||
virtual std::shared_ptr<inode> create_inode() = 0;
|
virtual std::shared_ptr<inode> create_inode() = 0;
|
||||||
virtual size_t count() const = 0;
|
virtual size_t count() const = 0;
|
||||||
virtual void
|
virtual void order_inodes(std::shared_ptr<script> scr,
|
||||||
order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
file_order_options const& file_order,
|
||||||
uint32_t first_inode, inode_cb const& fn) = 0;
|
uint32_t first_inode, inode_cb const& fn) = 0;
|
||||||
virtual void for_each_inode(
|
virtual void for_each_inode(
|
||||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||||
};
|
};
|
||||||
|
@ -54,10 +54,23 @@ struct inode_options {
|
|||||||
bool needs_scan() const { return with_similarity || with_nilsimsa; }
|
bool needs_scan() const { return with_similarity || with_nilsimsa; }
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA };
|
// Strategy used to order file inodes before they are written.
enum class file_order_mode {
  NONE,       // keep the existing inode order
  PATH,       // presumably orders by path; the implementation is not visible here
  SCRIPT,     // mkdwarfs rejects this mode unless a valid --script is given
  SIMILARITY, // enables the similarity scan (inode_options::with_similarity)
  NILSIMSA,   // enables the nilsimsa scan (inode_options::with_nilsimsa)
  NILSIMSA2,  // nilsimsa variant that honours file_order_options limit/depth
};
|
||||||
|
|
||||||
|
struct file_order_options {
|
||||||
|
file_order_mode mode{file_order_mode::NONE};
|
||||||
|
int nilsimsa_depth{10000};
|
||||||
|
int nilsimsa_limit{250};
|
||||||
|
};
|
||||||
|
|
||||||
struct scanner_options {
|
struct scanner_options {
|
||||||
file_order_mode file_order{file_order_mode::NONE};
|
file_order_options file_order;
|
||||||
std::optional<uint16_t> uid;
|
std::optional<uint16_t> uid;
|
||||||
std::optional<uint16_t> gid;
|
std::optional<uint16_t> gid;
|
||||||
std::optional<uint64_t> timestamp;
|
std::optional<uint64_t> timestamp;
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
#include "dwarfs/logger.h"
|
#include "dwarfs/logger.h"
|
||||||
#include "dwarfs/mmif.h"
|
#include "dwarfs/mmif.h"
|
||||||
#include "dwarfs/nilsimsa.h"
|
#include "dwarfs/nilsimsa.h"
|
||||||
|
#include "dwarfs/options.h"
|
||||||
#include "dwarfs/os_access.h"
|
#include "dwarfs/os_access.h"
|
||||||
#include "dwarfs/script.h"
|
#include "dwarfs/script.h"
|
||||||
#include "dwarfs/similarity.h"
|
#include "dwarfs/similarity.h"
|
||||||
@ -159,8 +160,8 @@ class inode_manager_ : public inode_manager::impl {
|
|||||||
|
|
||||||
size_t count() const override { return inodes_.size(); }
|
size_t count() const override { return inodes_.size(); }
|
||||||
|
|
||||||
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
|
void order_inodes(std::shared_ptr<script> scr,
|
||||||
uint32_t first_inode,
|
file_order_options const& file_order, uint32_t first_inode,
|
||||||
inode_manager::inode_cb const& fn) override;
|
inode_manager::inode_cb const& fn) override;
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -213,6 +214,10 @@ class inode_manager_ : public inode_manager::impl {
|
|||||||
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
|
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
|
||||||
uint32_t inode_no);
|
uint32_t inode_no);
|
||||||
|
|
||||||
|
void order_inodes_by_nilsimsa2(inode_manager::inode_cb const& fn,
|
||||||
|
uint32_t inode_no,
|
||||||
|
file_order_options const& file_order);
|
||||||
|
|
||||||
void number_inodes(size_t first_no) {
|
void number_inodes(size_t first_no) {
|
||||||
for (auto& i : inodes_) {
|
for (auto& i : inodes_) {
|
||||||
i->set_num(first_no++);
|
i->set_num(first_no++);
|
||||||
@ -225,9 +230,9 @@ class inode_manager_ : public inode_manager::impl {
|
|||||||
|
|
||||||
template <typename LoggerPolicy>
|
template <typename LoggerPolicy>
|
||||||
void inode_manager_<LoggerPolicy>::order_inodes(
|
void inode_manager_<LoggerPolicy>::order_inodes(
|
||||||
std::shared_ptr<script> scr, file_order_mode file_order,
|
std::shared_ptr<script> scr, file_order_options const& file_order,
|
||||||
uint32_t first_inode, inode_manager::inode_cb const& fn) {
|
uint32_t first_inode, inode_manager::inode_cb const& fn) {
|
||||||
switch (file_order) {
|
switch (file_order.mode) {
|
||||||
case file_order_mode::NONE:
|
case file_order_mode::NONE:
|
||||||
log_.info() << "keeping inode order";
|
log_.info() << "keeping inode order";
|
||||||
break;
|
break;
|
||||||
@ -265,15 +270,22 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
|||||||
auto ti = log_.timed_info();
|
auto ti = log_.timed_info();
|
||||||
order_inodes_by_nilsimsa(fn, first_inode);
|
order_inodes_by_nilsimsa(fn, first_inode);
|
||||||
ti << count() << " inodes ordered";
|
ti << count() << " inodes ordered";
|
||||||
break;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
case file_order_mode::NILSIMSA2: {
|
||||||
|
log_.info() << "ordering " << count()
|
||||||
|
<< " inodes using nilsimsa2 similarity...";
|
||||||
|
auto ti = log_.timed_info();
|
||||||
|
order_inodes_by_nilsimsa2(fn, first_inode, file_order);
|
||||||
|
ti << count() << " inodes ordered";
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (file_order != file_order_mode::NILSIMSA) {
|
log_.info() << "assigning file inodes...";
|
||||||
log_.info() << "assigning file inodes...";
|
number_inodes(first_inode);
|
||||||
number_inodes(first_inode);
|
for_each_inode(fn);
|
||||||
for_each_inode(fn);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename LoggerPolicy>
|
template <typename LoggerPolicy>
|
||||||
@ -399,6 +411,85 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(
|
||||||
|
inode_manager::inode_cb const& fn, uint32_t inode_no,
|
||||||
|
file_order_options const& file_order) {
|
||||||
|
auto count = inodes_.size();
|
||||||
|
|
||||||
|
std::vector<std::shared_ptr<inode>> inodes;
|
||||||
|
inodes.swap(inodes_);
|
||||||
|
inodes_.reserve(count);
|
||||||
|
std::vector<uint32_t> index;
|
||||||
|
index.resize(count);
|
||||||
|
std::iota(index.begin(), index.end(), 0);
|
||||||
|
|
||||||
|
auto finalize_inode = [&]() {
|
||||||
|
inodes_.push_back(std::move(inodes[index.back()]));
|
||||||
|
index.pop_back();
|
||||||
|
inodes_.back()->set_num(inode_no++);
|
||||||
|
fn(inodes_.back());
|
||||||
|
};
|
||||||
|
|
||||||
|
auto empty = std::partition(index.begin(), index.end(),
|
||||||
|
[&](auto i) { return inodes[i]->size() > 0; });
|
||||||
|
|
||||||
|
if (empty != index.end()) {
|
||||||
|
assert(empty + 1 == index.end());
|
||||||
|
finalize_inode();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!index.empty()) {
|
||||||
|
const int depth = file_order.nilsimsa_depth;
|
||||||
|
const int limit = file_order.nilsimsa_limit;
|
||||||
|
|
||||||
|
log_.info() << "nilsimsa: depth=" << depth << ", limit=" << limit;
|
||||||
|
|
||||||
|
std::sort(index.begin(), index.end(), [&](auto a, auto b) {
|
||||||
|
auto const& ia = *inodes[a];
|
||||||
|
auto const& ib = *inodes[b];
|
||||||
|
return (ia.size() < ib.size() ||
|
||||||
|
(ia.size() == ib.size() && ia.any()->path() < ib.any()->path()));
|
||||||
|
});
|
||||||
|
|
||||||
|
finalize_inode();
|
||||||
|
|
||||||
|
while (!index.empty()) {
|
||||||
|
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||||
|
|
||||||
|
int max_sim = 0;
|
||||||
|
int max_sim_ix = 0;
|
||||||
|
|
||||||
|
int end = int(index.size()) > depth ? index.size() - depth : 0;
|
||||||
|
|
||||||
|
for (int i = index.size() - 1; i >= end; --i) {
|
||||||
|
auto sim = dwarfs::nilsimsa_similarity(
|
||||||
|
ref_hash, inodes[index[i]]->nilsimsa_similarity_hash().data());
|
||||||
|
|
||||||
|
if (sim > max_sim) {
|
||||||
|
max_sim = sim;
|
||||||
|
max_sim_ix = i;
|
||||||
|
|
||||||
|
if (max_sim >= limit) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log_.trace() << max_sim << " @ " << max_sim_ix << "/" << index.size();
|
||||||
|
|
||||||
|
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
|
||||||
|
index.end());
|
||||||
|
|
||||||
|
finalize_inode();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count != inodes_.size()) {
|
||||||
|
throw std::runtime_error("internal error: nilsimsa ordering failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inode_manager::inode_manager(logger& lgr)
|
inode_manager::inode_manager(logger& lgr)
|
||||||
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
|
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
|
||||||
lgr)) {}
|
lgr)) {}
|
||||||
|
@ -40,7 +40,6 @@
|
|||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
#include <boost/any.hpp>
|
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
|
|
||||||
#include <folly/Conv.h>
|
#include <folly/Conv.h>
|
||||||
@ -96,7 +95,8 @@ const std::map<std::string, file_order_mode> order_choices{
|
|||||||
{"script", file_order_mode::SCRIPT},
|
{"script", file_order_mode::SCRIPT},
|
||||||
#endif
|
#endif
|
||||||
{"similarity", file_order_mode::SIMILARITY},
|
{"similarity", file_order_mode::SIMILARITY},
|
||||||
{"nilsimsa", file_order_mode::NILSIMSA}};
|
{"nilsimsa", file_order_mode::NILSIMSA},
|
||||||
|
{"nilsimsa2", file_order_mode::NILSIMSA2}};
|
||||||
|
|
||||||
const std::map<std::string, uint32_t> time_resolutions{
|
const std::map<std::string, uint32_t> time_resolutions{
|
||||||
{"sec", 1},
|
{"sec", 1},
|
||||||
@ -109,20 +109,6 @@ const std::map<std::string, uint32_t> time_resolutions{
|
|||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
void validate(boost::any& v, const std::vector<std::string>& values,
|
|
||||||
file_order_mode*, int) {
|
|
||||||
using namespace boost::program_options;
|
|
||||||
|
|
||||||
validators::check_first_occurrence(v);
|
|
||||||
|
|
||||||
auto it = order_choices.find(validators::get_single_string(values));
|
|
||||||
if (it == order_choices.end()) {
|
|
||||||
throw validation_error(validation_error::invalid_option_value);
|
|
||||||
}
|
|
||||||
|
|
||||||
v = boost::any(it->second);
|
|
||||||
}
|
|
||||||
|
|
||||||
class script_options : public options_interface {
|
class script_options : public options_interface {
|
||||||
public:
|
public:
|
||||||
script_options(logger& lgr, po::variables_map& vm, scanner_options& opts,
|
script_options(logger& lgr, po::variables_map& vm, scanner_options& opts,
|
||||||
@ -133,7 +119,7 @@ class script_options : public options_interface {
|
|||||||
, force_similarity_(force_similarity) {}
|
, force_similarity_(force_similarity) {}
|
||||||
|
|
||||||
void set_order(file_order_mode order_mode, set_mode mode = DEFAULT) override {
|
void set_order(file_order_mode order_mode, set_mode mode = DEFAULT) override {
|
||||||
set(opts_.file_order, order_mode, "order", mode);
|
set(opts_.file_order.mode, order_mode, "order", mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -289,7 +275,7 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
block_manager::config cfg;
|
block_manager::config cfg;
|
||||||
std::string path, output, window_sizes, memory_limit, script_arg, compression,
|
std::string path, output, window_sizes, memory_limit, script_arg, compression,
|
||||||
schema_compression, metadata_compression, log_level, timestamp,
|
schema_compression, metadata_compression, log_level, timestamp,
|
||||||
time_resolution;
|
time_resolution, order;
|
||||||
size_t num_workers, max_scanner_workers;
|
size_t num_workers, max_scanner_workers;
|
||||||
bool recompress = false, no_progress = false;
|
bool recompress = false, no_progress = false;
|
||||||
unsigned level;
|
unsigned level;
|
||||||
@ -356,8 +342,8 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
po::value<std::string>(&time_resolution)->default_value("sec"),
|
po::value<std::string>(&time_resolution)->default_value("sec"),
|
||||||
resolution_desc.c_str())
|
resolution_desc.c_str())
|
||||||
("order",
|
("order",
|
||||||
po::value<file_order_mode>(&options.file_order)
|
po::value<std::string>(&order)
|
||||||
->default_value(file_order_mode::SIMILARITY, "similarity"),
|
->default_value("similarity"),
|
||||||
order_desc.c_str())
|
order_desc.c_str())
|
||||||
#ifdef DWARFS_HAVE_PYTHON
|
#ifdef DWARFS_HAVE_PYTHON
|
||||||
("script",
|
("script",
|
||||||
@ -490,6 +476,40 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
window_sizes = defaults.window_sizes;
|
window_sizes = defaults.window_sizes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> order_opts;
|
||||||
|
boost::split(order_opts, order, boost::is_any_of(":"));
|
||||||
|
if (auto it = order_choices.find(order_opts.front());
|
||||||
|
it != order_choices.end()) {
|
||||||
|
options.file_order.mode = it->second;
|
||||||
|
if (order_opts.size() > 1) {
|
||||||
|
if (options.file_order.mode != file_order_mode::NILSIMSA2) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
fmt::format("file order mode '{}' does not support options",
|
||||||
|
order_opts.front()));
|
||||||
|
}
|
||||||
|
if (order_opts.size() > 3) {
|
||||||
|
throw std::runtime_error(fmt::format(
|
||||||
|
"too many options for file order mode '{}'", order_opts.front()));
|
||||||
|
}
|
||||||
|
options.file_order.nilsimsa_limit = folly::to<int>(order_opts[1]);
|
||||||
|
if (options.file_order.nilsimsa_limit < 0 ||
|
||||||
|
options.file_order.nilsimsa_limit > 255) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
fmt::format("limit ({}) out of range for '{}' (0..255)",
|
||||||
|
options.file_order.nilsimsa_limit, order_opts.front()));
|
||||||
|
}
|
||||||
|
if (order_opts.size() > 2) {
|
||||||
|
options.file_order.nilsimsa_depth = folly::to<int>(order_opts[2]);
|
||||||
|
if (options.file_order.nilsimsa_depth < 0) {
|
||||||
|
throw std::runtime_error(fmt::format(
|
||||||
|
"depth ({}) cannot be negative for '{}'", order_opts.front()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("invalid file order mode: " + order);
|
||||||
|
}
|
||||||
|
|
||||||
size_t mem_limit = parse_size_with_unit(memory_limit);
|
size_t mem_limit = parse_size_with_unit(memory_limit);
|
||||||
|
|
||||||
std::vector<std::string> wsv;
|
std::vector<std::string> wsv;
|
||||||
@ -544,7 +564,7 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
script->configure(script_opts);
|
script->configure(script_opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (options.file_order == file_order_mode::SCRIPT && !script) {
|
if (options.file_order.mode == file_order_mode::SCRIPT && !script) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"--order=script can only be used with a valid --script option");
|
"--order=script can only be used with a valid --script option");
|
||||||
}
|
}
|
||||||
@ -591,9 +611,11 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
ti << "filesystem rewritten";
|
ti << "filesystem rewritten";
|
||||||
} else {
|
} else {
|
||||||
options.inode.with_similarity =
|
options.inode.with_similarity =
|
||||||
force_similarity || options.file_order == file_order_mode::SIMILARITY;
|
force_similarity ||
|
||||||
|
options.file_order.mode == file_order_mode::SIMILARITY;
|
||||||
options.inode.with_nilsimsa =
|
options.inode.with_nilsimsa =
|
||||||
options.file_order == file_order_mode::NILSIMSA;
|
options.file_order.mode == file_order_mode::NILSIMSA ||
|
||||||
|
options.file_order.mode == file_order_mode::NILSIMSA2;
|
||||||
|
|
||||||
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
|
||||||
std::make_shared<os_access_posix>(), std::move(script), options);
|
std::make_shared<os_access_posix>(), std::move(script), options);
|
||||||
|
@ -199,7 +199,7 @@ void basic_end_to_end_test(std::string const& compressor,
|
|||||||
cfg.blockhash_window_size.push_back(1 << 10);
|
cfg.blockhash_window_size.push_back(1 << 10);
|
||||||
cfg.block_size_bits = block_size_bits;
|
cfg.block_size_bits = block_size_bits;
|
||||||
|
|
||||||
options.file_order = file_order;
|
options.file_order.mode = file_order;
|
||||||
options.with_devices = with_devices;
|
options.with_devices = with_devices;
|
||||||
options.with_specials = with_specials;
|
options.with_specials = with_specials;
|
||||||
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user