Dynamically adapt nilsimsa search depth

This commit is contained in:
Marcus Holland-Moritz 2020-12-30 13:24:50 +01:00
parent 6a328999be
commit 201b6ddb3b
11 changed files with 152 additions and 56 deletions

View File

@ -154,7 +154,7 @@ Most other options are concerned with compression tuning:
the `mtime` field in order to save metadata space. If you want to save
`atime` and `ctime` as well, use this option.
* `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`:
* `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*[`:`*mindepth*]]]|`script`:
The order in which inodes will be written to the file system. Choosing `none`,
the inodes will be stored in the order in which they are discovered. With
`path`, they will be sorted asciibetically by path name of the first file
@ -168,9 +168,15 @@ Most other options are concerned with compression tuning:
enough for adding. A *limit* of 255 means "essentially identical", whereas
a *limit* of 0 means "not similar at all". The *depth* determines up to
how many inodes can be checked at most while searching for a similar one.
The default if you omit these values is a *limit* of 255 and a *depth*
of 20000. Last but not least, if scripting support is built into `mkdwarfs`,
you can choose `script` to let the script determine the order.
To prevent nilsimsa ordering from becoming a bottleneck when ordering lots of
small files, the *depth* is adjusted dynamically to keep the input queue
to the segmentation/compression stages adequately filled. You can limit
how far the *depth* may be reduced by also specifying *mindepth*.
The default if you omit these values is a *limit* of 255, a *depth*
of 20000 and a *mindepth* of 1000. Note that if you want reproducible
results, you need to set *depth* and *mindepth* to the same value.
Last but not least, if scripting support is built into `mkdwarfs`, you can
choose `script` to let the script determine the order.
* `--blockhash-window-sizes=`*value*[,*value*]...:
Window sizes used for block hashing. These sizes, separated by commas,

View File

@ -61,5 +61,6 @@ class console_writer : public logger {
display_mode const mode_;
bool const color_;
bool const with_context_;
bool const debug_progress_;
};
} // namespace dwarfs

View File

@ -87,6 +87,8 @@ class filesystem_writer {
size_t size() const { return impl_->size(); }
int queue_fill() const { return impl_->queue_fill(); }
class impl {
public:
virtual ~impl() = default;
@ -99,6 +101,7 @@ class filesystem_writer {
folly::ByteRange data) = 0;
virtual void flush() = 0;
virtual size_t size() const = 0;
virtual int queue_fill() const = 0;
};
private:

View File

@ -29,6 +29,7 @@ namespace dwarfs {
class inode;
class logger;
class progress;
class script;
struct file_order_options;
@ -36,8 +37,9 @@ struct file_order_options;
class inode_manager {
public:
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
using order_cb = std::function<int64_t(std::shared_ptr<inode> const&)>;
inode_manager(logger& lgr);
inode_manager(logger& lgr, progress& prog);
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }
@ -45,7 +47,7 @@ class inode_manager {
void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode,
inode_cb const& fn) {
order_cb const& fn) {
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
}
@ -59,7 +61,7 @@ class inode_manager {
virtual size_t count() const = 0;
virtual void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order,
uint32_t first_inode, inode_cb const& fn) = 0;
uint32_t first_inode, order_cb const& fn) = 0;
virtual void for_each_inode(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
};

View File

@ -59,6 +59,7 @@ enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA };
// Settings that control the order in which inodes are written.
struct file_order_options {
  // Selected ordering strategy.
  file_order_mode mode = file_order_mode::NONE;
  // Upper bound on how many candidate inodes the nilsimsa search examines.
  int nilsimsa_depth = 20000;
  // Lower bound the dynamically adjusted nilsimsa search depth is clamped to.
  int nilsimsa_min_depth = 1000;
  // Nilsimsa similarity limit; the command line accepts values in 0..255.
  int nilsimsa_limit = 255;
};

View File

@ -70,6 +70,9 @@ class progress {
std::atomic<size_t> inodes_written{0};
std::atomic<size_t> blocks_written{0};
std::atomic<size_t> errors{0};
std::atomic<size_t> nilsimsa_depth{0};
std::atomic<size_t> blockify_queue{0};
std::atomic<size_t> compress_queue{0};
std::atomic<uint64_t> original_size{0};
std::atomic<uint64_t> hardlink_size{0};
std::atomic<uint64_t> saved_by_deduplication{0};

View File

@ -19,6 +19,7 @@
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <cstdlib>
#include <cstring>
#include <locale>
#include <sstream>
@ -44,6 +45,15 @@ namespace {
char const* const asc_bar[8] = {"=", "=", "=", "=", "=", "=", "=", "="};
char const* const uni_bar[8] = {"", "", "", "", "", "", "", ""};
// Reports whether the DWARFS_DEBUG_PROGRESS environment variable is set to a
// value that converts to boolean `true`. An unset variable or a value that
// cannot be converted to bool yields `false`.
bool is_debug_progress() {
  char const* const raw = ::getenv("DWARFS_DEBUG_PROGRESS");
  if (raw == nullptr) {
    return false;
  }

  auto const parsed = folly::tryTo<bool>(raw);
  if (!parsed) {
    return false;
  }

  return *parsed;
}
} // namespace
console_writer::console_writer(std::ostream& os, progress_mode pg_mode,
@ -56,7 +66,8 @@ console_writer::console_writer(std::ostream& os, progress_mode pg_mode,
, width_(width)
, mode_(mode)
, color_(stream_is_fancy_terminal(os))
, with_context_(with_context) {
, with_context_(with_context)
, debug_progress_(is_debug_progress()) {
os_.imbue(std::locale(os_.getloc(),
new boost::posix_time::time_facet("%H:%M:%S.%f")));
if (threshold > level_type::INFO) {
@ -176,7 +187,18 @@ void console_writer::update(const progress& p, bool last) {
<< p.files_found - p.duplicate_files << " inodes)" << newline
<< "compressed filesystem: " << p.blocks_written << " blocks/"
<< size_with_unit(p.compressed_size) << " written" << newline;
<< size_with_unit(p.compressed_size) << " written";
if (debug_progress_) {
oss << " [" << p.nilsimsa_depth << "/" << p.blockify_queue << "/"
<< p.compress_queue << "]";
} else {
if (p.nilsimsa_depth > 0) {
oss << " [depth: " << p.nilsimsa_depth << "]";
}
}
oss << newline;
break;
case REWRITE:

View File

@ -218,6 +218,7 @@ class filesystem_writer_ : public filesystem_writer::impl {
folly::ByteRange data) override;
void flush() override;
size_t size() const override { return os_.tellp(); }
int queue_fill() const override { return static_cast<int>(wg_.queue_size()); }
private:
void write_section(section_type type, std::vector<uint8_t>&& data,

View File

@ -37,6 +37,7 @@
#include "dwarfs/nilsimsa.h"
#include "dwarfs/options.h"
#include "dwarfs/os_access.h"
#include "dwarfs/progress.h"
#include "dwarfs/script.h"
#include "dwarfs/similarity.h"
@ -149,8 +150,9 @@ class nilsimsa_cache_entry {
template <typename LoggerPolicy>
class inode_manager_ : public inode_manager::impl {
public:
inode_manager_(logger& lgr)
: log_(lgr) {}
inode_manager_(logger& lgr, progress& prog)
: LOG_PROXY_INIT(lgr)
, prog_(prog) {}
std::shared_ptr<inode> create_inode() override {
auto ino = std::make_shared<inode_>();
@ -162,7 +164,7 @@ class inode_manager_ : public inode_manager::impl {
void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode,
inode_manager::inode_cb const& fn) override;
inode_manager::order_cb const& fn) override;
void
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
@ -215,7 +217,7 @@ class inode_manager_ : public inode_manager::impl {
std::vector<uint32_t>& index);
void
order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn, uint32_t inode_no,
order_inodes_by_nilsimsa(inode_manager::order_cb const& fn, uint32_t inode_no,
file_order_options const& file_order);
void number_inodes(size_t first_no) {
@ -225,13 +227,14 @@ class inode_manager_ : public inode_manager::impl {
}
std::vector<std::shared_ptr<inode>> inodes_;
log_proxy<LoggerPolicy> log_;
LOG_PROXY_DECL(LoggerPolicy);
progress& prog_;
};
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes(
std::shared_ptr<script> scr, file_order_options const& file_order,
uint32_t first_inode, inode_manager::inode_cb const& fn) {
uint32_t first_inode, inode_manager::order_cb const& fn) {
switch (file_order.mode) {
case file_order_mode::NONE:
LOG_INFO << "keeping inode order";
@ -276,7 +279,9 @@ void inode_manager_<LoggerPolicy>::order_inodes(
LOG_INFO << "assigning file inodes...";
number_inodes(first_inode);
for_each_inode(fn);
for (const auto& ino : inodes_) {
fn(ino);
}
}
template <typename LoggerPolicy>
@ -322,7 +327,7 @@ void inode_manager_<LoggerPolicy>::presort_index(
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inode_manager::inode_cb const& fn, uint32_t inode_no,
inode_manager::order_cb const& fn, uint32_t inode_no,
file_order_options const& file_order) {
auto count = inodes_.size();
@ -340,7 +345,7 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inodes_.push_back(std::move(inodes[index.back()]));
index.pop_back();
inodes_.back()->set_num(inode_no++);
fn(inodes_.back());
return fn(inodes_.back());
};
if (empty != index.end()) {
@ -349,10 +354,15 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
}
if (!index.empty()) {
const int depth = file_order.nilsimsa_depth;
const int limit = file_order.nilsimsa_limit;
const int_fast32_t max_depth = file_order.nilsimsa_depth;
const int_fast32_t min_depth =
std::min<int32_t>(file_order.nilsimsa_min_depth, max_depth);
const int_fast32_t limit = file_order.nilsimsa_limit;
int_fast32_t depth = max_depth;
int64_t processed = 0;
LOG_INFO << "nilsimsa: depth=" << depth << ", limit=" << limit;
LOG_INFO << "nilsimsa: depth=" << depth << " (" << min_depth
<< "), limit=" << limit;
presort_index(inodes, index);
@ -361,12 +371,12 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
while (!index.empty()) {
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
int max_sim = 0;
int max_sim_ix = 0;
int_fast32_t max_sim = 0;
int_fast32_t max_sim_ix = 0;
int end = int(index.size()) > depth ? index.size() - depth : 0;
int_fast32_t end = int(index.size()) > depth ? index.size() - depth : 0;
for (int i = index.size() - 1; i >= end; --i) {
for (int_fast32_t i = index.size() - 1; i >= end; --i) {
auto sim = dwarfs::nilsimsa_similarity(
ref_hash, inodes[index[i]]->nilsimsa_similarity_hash().data());
@ -385,7 +395,22 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
index.end());
finalize_inode();
auto fill = finalize_inode();
if (++processed >= 4096 && processed % 32 == 0) {
constexpr int64_t smooth = 512;
auto target_depth = fill * max_depth / 2048;
depth = ((smooth - 1) * depth + target_depth) / smooth;
if (depth > max_depth) {
depth = max_depth;
} else if (depth < min_depth) {
depth = min_depth;
}
}
prog_.nilsimsa_depth = depth;
}
}
@ -394,8 +419,8 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
}
}
inode_manager::inode_manager(logger& lgr)
inode_manager::inode_manager(logger& lgr, progress& prog)
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
lgr)) {}
lgr, prog)) {}
} // namespace dwarfs

View File

@ -485,7 +485,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
LOG_INFO << "finding duplicate files...";
inode_manager im(lgr_);
inode_manager im(lgr_, prog);
file_deduplication_visitor fdv;
root->accept(fdv);
@ -540,7 +540,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
LOG_INFO << "building blocks...";
block_manager bm(lgr_, prog, cfg_, os_, fsw);
worker_group blockify("blockify");
worker_group blockify("blockify", 1, 1 << 20);
im.order_inodes(script_, options_.file_order, first_file_inode,
[&](std::shared_ptr<inode> const& ino) {
@ -549,6 +549,12 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
bm.add_inode(ino);
prog.inodes_written++;
});
auto queued_files = blockify.queue_size();
auto queued_blocks = fsw.queue_fill();
prog.blockify_queue = queued_files;
prog.compress_queue = queued_blocks;
return INT64_C(500) * queued_blocks +
static_cast<int64_t>(queued_files);
});
LOG_INFO << "waiting for segmenting/blockifying to finish...";

View File

@ -30,6 +30,7 @@
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
@ -171,6 +172,40 @@ class script_options : public options_interface {
namespace {
// Parses one numeric sub-option of an `--order` specification.
//
// An empty `opt` means "not specified": `value` keeps its current contents
// and the call succeeds. Otherwise `opt` must convert to an int that honours
// the optional inclusive bounds `min` and `max`.
//
//   ordname  name of the order mode; only used in error messages
//   opt      textual option value, may be empty
//   value    receives the parsed number, written only on success
//   name     human-readable option name; only used in error messages
//   min/max  optional inclusive lower/upper bound
//
// Returns 0 on success and 1 after printing a diagnostic to std::cerr.
int parse_order_option(std::string const& ordname, std::string const& opt,
                       int& value, std::string_view name,
                       std::optional<int> min = std::nullopt,
                       std::optional<int> max = std::nullopt) {
  if (opt.empty()) {
    return 0;
  }

  auto const val = folly::tryTo<int>(opt);

  if (!val) {
    std::cerr << "error: " << name << " (" << opt
              << ") is not numeric for order '" << ordname << "'"
              << std::endl;
    return 1;
  }

  auto const tmp = *val;

  // Both bounds given: report the full valid range in one message.
  if (min && max && (tmp < *min || tmp > *max)) {
    std::cerr << "error: " << name << " (" << opt
              << ") out of range for order '" << ordname << "' (" << *min
              << ".." << *max << ")" << std::endl;
    return 1;
  }

  // One-sided bounds. These must fail as well; previously the error was
  // printed but the invalid value was still stored and 0 was returned.
  if (min && tmp < *min) {
    std::cerr << "error: " << name << " (" << opt
              << ") cannot be less than " << *min << " for order '"
              << ordname << "'" << std::endl;
    return 1;
  }

  if (max && tmp > *max) {
    std::cerr << "error: " << name << " (" << opt
              << ") cannot be greater than " << *max << " for order '"
              << ordname << "'" << std::endl;
    return 1;
  }

  value = tmp;
  return 0;
}
size_t get_term_width() {
struct ::winsize w;
::ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
@ -281,7 +316,7 @@ constexpr std::array<level_defaults, 10> levels{{
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", "16,14,12", "nilsimsa"},
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, "16,14,12", "nilsimsa"},
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa:255:50000"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa::50000"},
// clang-format on
}};
@ -548,40 +583,31 @@ int mkdwarfs(int argc, char** argv) {
return 1;
}
if (order_opts.size() > 3) {
if (order_opts.size() > 4) {
std::cerr << "error: too many options for inode order mode '"
<< order_opts[0] << "'" << std::endl;
return 1;
}
if (auto val = folly::tryTo<int>(order_opts[1])) {
options.file_order.nilsimsa_limit = *val;
if (options.file_order.nilsimsa_limit < 0 ||
options.file_order.nilsimsa_limit > 255) {
std::cerr << "error: limit (" << order_opts[1]
<< ") out of range for order '" << order_opts[0]
<< "' (0..255)" << std::endl;
return 1;
}
} else {
std::cerr << "error: limit (" << order_opts[1]
<< ") is not numeric for order '" << order_opts[0] << "'"
<< std::endl;
auto ordname = order_opts[0];
if (parse_order_option(ordname, order_opts[1],
options.file_order.nilsimsa_limit, "limit", 0,
255)) {
return 1;
}
if (order_opts.size() > 2) {
if (auto val = folly::tryTo<int>(order_opts[2])) {
options.file_order.nilsimsa_depth = *val;
if (options.file_order.nilsimsa_depth < 0) {
std::cerr << "error: depth (" << order_opts[2]
<< ") cannot be negative for order '" << order_opts[0]
<< "'" << std::endl;
if (parse_order_option(ordname, order_opts[2],
options.file_order.nilsimsa_depth, "depth", 0)) {
return 1;
}
} else {
std::cerr << "error: depth (" << order_opts[2]
<< ") is not numeric for order '" << order_opts[0] << "'"
<< std::endl;
}
if (order_opts.size() > 3) {
if (parse_order_option(ordname, order_opts[3],
options.file_order.nilsimsa_min_depth,
"min depth", 0)) {
return 1;
}
}