Add nilsimsa2 order mode (will replace nilsimsa after more testing)

This commit is contained in:
Marcus Holland-Moritz 2020-12-09 09:02:17 +01:00
parent f8164c9e3a
commit 14aed67ade
5 changed files with 170 additions and 43 deletions

View File

@ -25,14 +25,14 @@
#include <functional>
#include <memory>
#include "dwarfs/options.h"
namespace dwarfs {
class inode;
class logger;
class script;
struct file_order_options;
class inode_manager {
public:
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
@ -43,8 +43,9 @@ class inode_manager {
size_t count() const { return impl_->count(); }
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
uint32_t first_inode, inode_cb const& fn) {
void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode,
inode_cb const& fn) {
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
}
@ -56,9 +57,9 @@ class inode_manager {
virtual std::shared_ptr<inode> create_inode() = 0;
virtual size_t count() const = 0;
virtual void
order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
uint32_t first_inode, inode_cb const& fn) = 0;
virtual void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order,
uint32_t first_inode, inode_cb const& fn) = 0;
virtual void for_each_inode(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
};

View File

@ -54,10 +54,23 @@ struct inode_options {
bool needs_scan() const { return with_similarity || with_nilsimsa; }
};
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA };
// Strategy used by inode_manager::order_inodes to decide the order in which
// file inodes are numbered and handed to the callback.
enum class file_order_mode {
// Keep the inodes in their current order.
NONE,
// Path-based ordering (handler not shown in this view).
PATH,
// Script-driven ordering; mkdwarfs rejects this mode unless a valid
// --script option is given.
SCRIPT,
// Similarity-based ordering; this is the default for --order and enables
// the `with_similarity` inode scan.
SIMILARITY,
// Nilsimsa-based ordering (order_inodes_by_nilsimsa); enables the
// `with_nilsimsa` inode scan.
NILSIMSA,
// Second nilsimsa-based ordering (order_inodes_by_nilsimsa2); also enables
// the `with_nilsimsa` scan and is the only mode that accepts the extra
// limit/depth options on the command line.
NILSIMSA2
};
// Bundles the file ordering strategy with its tuning parameters. The
// nilsimsa_* fields are read by the NILSIMSA2 ordering.
struct file_order_options {
// Selected ordering strategy.
file_order_mode mode{file_order_mode::NONE};
// Maximum number of remaining candidates that are compared against the most
// recently emitted inode in each NILSIMSA2 step.
int nilsimsa_depth{10000};
// Similarity value at or above which the NILSIMSA2 candidate scan stops
// early and takes the current best match. mkdwarfs accepts 0..255 here.
int nilsimsa_limit{250};
};
struct scanner_options {
file_order_mode file_order{file_order_mode::NONE};
file_order_options file_order;
std::optional<uint16_t> uid;
std::optional<uint16_t> gid;
std::optional<uint64_t> timestamp;

View File

@ -35,6 +35,7 @@
#include "dwarfs/logger.h"
#include "dwarfs/mmif.h"
#include "dwarfs/nilsimsa.h"
#include "dwarfs/options.h"
#include "dwarfs/os_access.h"
#include "dwarfs/script.h"
#include "dwarfs/similarity.h"
@ -159,8 +160,8 @@ class inode_manager_ : public inode_manager::impl {
size_t count() const override { return inodes_.size(); }
void order_inodes(std::shared_ptr<script> scr, file_order_mode file_order,
uint32_t first_inode,
void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode,
inode_manager::inode_cb const& fn) override;
void
@ -213,6 +214,10 @@ class inode_manager_ : public inode_manager::impl {
void order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn,
uint32_t inode_no);
void order_inodes_by_nilsimsa2(inode_manager::inode_cb const& fn,
uint32_t inode_no,
file_order_options const& file_order);
void number_inodes(size_t first_no) {
for (auto& i : inodes_) {
i->set_num(first_no++);
@ -225,9 +230,9 @@ class inode_manager_ : public inode_manager::impl {
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes(
std::shared_ptr<script> scr, file_order_mode file_order,
std::shared_ptr<script> scr, file_order_options const& file_order,
uint32_t first_inode, inode_manager::inode_cb const& fn) {
switch (file_order) {
switch (file_order.mode) {
case file_order_mode::NONE:
log_.info() << "keeping inode order";
break;
@ -265,15 +270,22 @@ void inode_manager_<LoggerPolicy>::order_inodes(
auto ti = log_.timed_info();
order_inodes_by_nilsimsa(fn, first_inode);
ti << count() << " inodes ordered";
break;
return;
}
case file_order_mode::NILSIMSA2: {
log_.info() << "ordering " << count()
<< " inodes using nilsimsa2 similarity...";
auto ti = log_.timed_info();
order_inodes_by_nilsimsa2(fn, first_inode, file_order);
ti << count() << " inodes ordered";
return;
}
}
if (file_order != file_order_mode::NILSIMSA) {
log_.info() << "assigning file inodes...";
number_inodes(first_inode);
for_each_inode(fn);
}
log_.info() << "assigning file inodes...";
number_inodes(first_inode);
for_each_inode(fn);
}
template <typename LoggerPolicy>
@ -399,6 +411,85 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
}
}
// Reorders all inodes so that each emitted inode is followed by the remaining
// inode whose nilsimsa hash is most similar to it (within a bounded window).
//
// fn          callback invoked once per inode, right after it was numbered
// inode_no    number assigned to the first emitted inode; incremented per inode
// file_order  supplies nilsimsa_depth (window size) and nilsimsa_limit
//             (early-exit similarity threshold)
//
// Throws std::runtime_error if the number of emitted inodes does not match
// the number of inodes present on entry.
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(
inode_manager::inode_cb const& fn, uint32_t inode_no,
file_order_options const& file_order) {
auto count = inodes_.size();
// Move the current inodes aside; inodes_ is rebuilt below in the new order.
std::vector<std::shared_ptr<inode>> inodes;
inodes.swap(inodes_);
inodes_.reserve(count);
// `index` holds positions into `inodes` that have not been emitted yet.
std::vector<uint32_t> index;
index.resize(count);
std::iota(index.begin(), index.end(), 0);
// Emits the inode referenced by the LAST entry of `index`: moves it into
// inodes_, drops the index entry, assigns the next number and calls fn.
auto finalize_inode = [&]() {
inodes_.push_back(std::move(inodes[index.back()]));
index.pop_back();
inodes_.back()->set_num(inode_no++);
fn(inodes_.back());
};
// Move zero-sized inodes to the end of `index` so they can be emitted first.
auto empty = std::partition(index.begin(), index.end(),
[&](auto i) { return inodes[i]->size() > 0; });
if (empty != index.end()) {
// The code expects at most one zero-sized inode (checked in debug builds
// only).
assert(empty + 1 == index.end());
finalize_inode();
}
if (!index.empty()) {
const int depth = file_order.nilsimsa_depth;
const int limit = file_order.nilsimsa_limit;
log_.info() << "nilsimsa: depth=" << depth << ", limit=" << limit;
// Sort ascending by size, ties broken by path, so the largest inode sits at
// the back of `index` and is the first one emitted below.
std::sort(index.begin(), index.end(), [&](auto a, auto b) {
auto const& ia = *inodes[a];
auto const& ib = *inodes[b];
return (ia.size() < ib.size() ||
(ia.size() == ib.size() && ia.any()->path() < ib.any()->path()));
});
finalize_inode();
while (!index.empty()) {
// Compare candidates against the inode that was emitted last.
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
int max_sim = 0;
// NOTE(review): if no candidate in the window scores above 0, position 0 is
// selected, which can lie outside the scanned window - confirm this is
// intended.
int max_sim_ix = 0;
// Only the last `depth` entries of `index` are scanned, back to front.
int end = int(index.size()) > depth ? index.size() - depth : 0;
for (int i = index.size() - 1; i >= end; --i) {
auto sim = dwarfs::nilsimsa_similarity(
ref_hash, inodes[index[i]]->nilsimsa_similarity_hash().data());
// Strict comparison: on equal similarity the candidate seen first (closest
// to the back) is kept.
if (sim > max_sim) {
max_sim = sim;
max_sim_ix = i;
// Good enough match: stop scanning the rest of the window.
if (max_sim >= limit) {
break;
}
}
}
log_.trace() << max_sim << " @ " << max_sim_ix << "/" << index.size();
// Move the chosen entry to the back of `index` (keeping the relative order
// of all other entries) so that finalize_inode() emits it.
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
index.end());
finalize_inode();
}
}
// Sanity check: every inode taken out of inodes_ must have been put back.
if (count != inodes_.size()) {
throw std::runtime_error("internal error: nilsimsa ordering failed");
}
}
// Constructs the manager; the implementation object is created through
// make_unique_logging_object with the given logger.
inode_manager::inode_manager(logger& lgr)
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
lgr)) {}

View File

@ -40,7 +40,6 @@
#include <unistd.h>
#include <boost/algorithm/string.hpp>
#include <boost/any.hpp>
#include <boost/program_options.hpp>
#include <folly/Conv.h>
@ -96,7 +95,8 @@ const std::map<std::string, file_order_mode> order_choices{
{"script", file_order_mode::SCRIPT},
#endif
{"similarity", file_order_mode::SIMILARITY},
{"nilsimsa", file_order_mode::NILSIMSA}};
{"nilsimsa", file_order_mode::NILSIMSA},
{"nilsimsa2", file_order_mode::NILSIMSA2}};
const std::map<std::string, uint32_t> time_resolutions{
{"sec", 1},
@ -109,20 +109,6 @@ const std::map<std::string, uint32_t> time_resolutions{
namespace dwarfs {
// boost::program_options validator for file_order_mode: looks up the single
// option string in order_choices and stores the matching mode in `v`.
// Throws validation_error (invalid_option_value) for unknown names.
void validate(boost::any& v, const std::vector<std::string>& values,
              file_order_mode*, int) {
  // The option may only be specified once.
  boost::program_options::validators::check_first_occurrence(v);

  auto const& name =
      boost::program_options::validators::get_single_string(values);

  if (auto const choice = order_choices.find(name);
      choice != order_choices.end()) {
    v = boost::any(choice->second);
    return;
  }

  throw boost::program_options::validation_error(
      boost::program_options::validation_error::invalid_option_value);
}
class script_options : public options_interface {
public:
script_options(logger& lgr, po::variables_map& vm, scanner_options& opts,
@ -133,7 +119,7 @@ class script_options : public options_interface {
, force_similarity_(force_similarity) {}
void set_order(file_order_mode order_mode, set_mode mode = DEFAULT) override {
set(opts_.file_order, order_mode, "order", mode);
set(opts_.file_order.mode, order_mode, "order", mode);
}
void
@ -289,7 +275,7 @@ int mkdwarfs(int argc, char** argv) {
block_manager::config cfg;
std::string path, output, window_sizes, memory_limit, script_arg, compression,
schema_compression, metadata_compression, log_level, timestamp,
time_resolution;
time_resolution, order;
size_t num_workers, max_scanner_workers;
bool recompress = false, no_progress = false;
unsigned level;
@ -356,8 +342,8 @@ int mkdwarfs(int argc, char** argv) {
po::value<std::string>(&time_resolution)->default_value("sec"),
resolution_desc.c_str())
("order",
po::value<file_order_mode>(&options.file_order)
->default_value(file_order_mode::SIMILARITY, "similarity"),
po::value<std::string>(&order)
->default_value("similarity"),
order_desc.c_str())
#ifdef DWARFS_HAVE_PYTHON
("script",
@ -490,6 +476,40 @@ int mkdwarfs(int argc, char** argv) {
window_sizes = defaults.window_sizes;
}
// Parse the --order argument, which has the form MODE[:LIMIT[:DEPTH]].
// MODE must be a key of order_choices; the optional LIMIT and DEPTH values
// are only accepted for the nilsimsa2 mode. Any violation is reported by
// throwing std::runtime_error.
std::vector<std::string> order_opts;
boost::split(order_opts, order, boost::is_any_of(":"));

if (auto it = order_choices.find(order_opts.front());
    it != order_choices.end()) {
  options.file_order.mode = it->second;

  if (order_opts.size() > 1) {
    if (options.file_order.mode != file_order_mode::NILSIMSA2) {
      throw std::runtime_error(
          fmt::format("file order mode '{}' does not support options",
                      order_opts.front()));
    }

    // At most MODE, LIMIT and DEPTH may be given.
    if (order_opts.size() > 3) {
      throw std::runtime_error(fmt::format(
          "too many options for file order mode '{}'", order_opts.front()));
    }

    options.file_order.nilsimsa_limit = folly::to<int>(order_opts[1]);

    if (options.file_order.nilsimsa_limit < 0 ||
        options.file_order.nilsimsa_limit > 255) {
      throw std::runtime_error(
          fmt::format("limit ({}) out of range for '{}' (0..255)",
                      options.file_order.nilsimsa_limit, order_opts.front()));
    }

    if (order_opts.size() > 2) {
      options.file_order.nilsimsa_depth = folly::to<int>(order_opts[2]);

      if (options.file_order.nilsimsa_depth < 0) {
        // The message has two placeholders: the offending depth value and
        // the mode name. Both must be supplied, otherwise formatting fails
        // instead of producing the intended error message.
        throw std::runtime_error(fmt::format(
            "depth ({}) cannot be negative for '{}'",
            options.file_order.nilsimsa_depth, order_opts.front()));
      }
    }
  }
} else {
  throw std::runtime_error("invalid file order mode: " + order);
}
size_t mem_limit = parse_size_with_unit(memory_limit);
std::vector<std::string> wsv;
@ -544,7 +564,7 @@ int mkdwarfs(int argc, char** argv) {
script->configure(script_opts);
}
if (options.file_order == file_order_mode::SCRIPT && !script) {
if (options.file_order.mode == file_order_mode::SCRIPT && !script) {
throw std::runtime_error(
"--order=script can only be used with a valid --script option");
}
@ -591,9 +611,11 @@ int mkdwarfs(int argc, char** argv) {
ti << "filesystem rewritten";
} else {
options.inode.with_similarity =
force_similarity || options.file_order == file_order_mode::SIMILARITY;
force_similarity ||
options.file_order.mode == file_order_mode::SIMILARITY;
options.inode.with_nilsimsa =
options.file_order == file_order_mode::NILSIMSA;
options.file_order.mode == file_order_mode::NILSIMSA ||
options.file_order.mode == file_order_mode::NILSIMSA2;
scanner s(lgr, wg_scanner, cfg, entry_factory::create(),
std::make_shared<os_access_posix>(), std::move(script), options);

View File

@ -199,7 +199,7 @@ void basic_end_to_end_test(std::string const& compressor,
cfg.blockhash_window_size.push_back(1 << 10);
cfg.block_size_bits = block_size_bits;
options.file_order = file_order;
options.file_order.mode = file_order;
options.with_devices = with_devices;
options.with_specials = with_specials;
options.inode.with_similarity = file_order == file_order_mode::SIMILARITY;