Dynamically adapt nilsimsa search depth

This commit is contained in:
Marcus Holland-Moritz 2020-12-30 13:24:50 +01:00
parent 6a328999be
commit 201b6ddb3b
11 changed files with 152 additions and 56 deletions

View File

@ -154,7 +154,7 @@ Most other options are concerned with compression tuning:
the `mtime` field in order to save metadata space. If you want to save the `mtime` field in order to save metadata space. If you want to save
`atime` and `ctime` as well, use this option. `atime` and `ctime` as well, use this option.
* `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`: * `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*[`:`*mindepth*]]]|`script`:
The order in which inodes will be written to the file system. Choosing `none`, The order in which inodes will be written to the file system. Choosing `none`,
the inodes will be stored in the order in which they are discovered. With the inodes will be stored in the order in which they are discovered. With
`path`, they will be sorted asciibetically by path name of the first file `path`, they will be sorted asciibetically by path name of the first file
@ -168,9 +168,15 @@ Most other options are concerned with compression tuning:
enough for adding. A *limit* of 255 means "essentially identical", whereas enough for adding. A *limit* of 255 means "essentially identical", whereas
a *limit* of 0 means "not similar at all". The *depth* determines up to a *limit* of 0 means "not similar at all". The *depth* determines up to
how many inodes can be checked at most while searching for a similar one. how many inodes can be checked at most while searching for a similar one.
The default if you omit these values is a *limit* of 255 and a *depth* To keep nilsimsa ordering from becoming a bottleneck when ordering lots of
of 20000. Last but not least, if scripting support is built into `mkdwarfs`, small files, the *depth* is adjusted dynamically to keep the input queue
you can choose `script` to let the script determine the order. to the segmentation/compression stages adequately filled. You can specify
how much the *depth* can be adjusted by also specifying *mindepth*.
The default if you omit these values is a *limit* of 255, a *depth*
of 20000 and a *mindepth* of 1000. Note that if you want reproducible
results, you need to set *depth* and *mindepth* to the same value.
Last but not least, if scripting support is built into `mkdwarfs`, you can
choose `script` to let the script determine the order.
* `--blockhash-window-sizes=`*value*[,*value*]...: * `--blockhash-window-sizes=`*value*[,*value*]...:
Window sizes used for block hashing. These sizes, separated by commas, Window sizes used for block hashing. These sizes, separated by commas,

View File

@ -61,5 +61,6 @@ class console_writer : public logger {
display_mode const mode_; display_mode const mode_;
bool const color_; bool const color_;
bool const with_context_; bool const with_context_;
bool const debug_progress_;
}; };
} // namespace dwarfs } // namespace dwarfs

View File

@ -87,6 +87,8 @@ class filesystem_writer {
size_t size() const { return impl_->size(); } size_t size() const { return impl_->size(); }
int queue_fill() const { return impl_->queue_fill(); }
class impl { class impl {
public: public:
virtual ~impl() = default; virtual ~impl() = default;
@ -99,6 +101,7 @@ class filesystem_writer {
folly::ByteRange data) = 0; folly::ByteRange data) = 0;
virtual void flush() = 0; virtual void flush() = 0;
virtual size_t size() const = 0; virtual size_t size() const = 0;
virtual int queue_fill() const = 0;
}; };
private: private:

View File

@ -29,6 +29,7 @@ namespace dwarfs {
class inode; class inode;
class logger; class logger;
class progress;
class script; class script;
struct file_order_options; struct file_order_options;
@ -36,8 +37,9 @@ struct file_order_options;
class inode_manager { class inode_manager {
public: public:
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>; using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
using order_cb = std::function<int64_t(std::shared_ptr<inode> const&)>;
inode_manager(logger& lgr); inode_manager(logger& lgr, progress& prog);
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); } std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }
@ -45,7 +47,7 @@ class inode_manager {
void order_inodes(std::shared_ptr<script> scr, void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode, file_order_options const& file_order, uint32_t first_inode,
inode_cb const& fn) { order_cb const& fn) {
impl_->order_inodes(std::move(scr), file_order, first_inode, fn); impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
} }
@ -59,7 +61,7 @@ class inode_manager {
virtual size_t count() const = 0; virtual size_t count() const = 0;
virtual void order_inodes(std::shared_ptr<script> scr, virtual void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, file_order_options const& file_order,
uint32_t first_inode, inode_cb const& fn) = 0; uint32_t first_inode, order_cb const& fn) = 0;
virtual void for_each_inode( virtual void for_each_inode(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0; std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
}; };

View File

@ -59,6 +59,7 @@ enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA };
struct file_order_options { struct file_order_options {
file_order_mode mode{file_order_mode::NONE}; file_order_mode mode{file_order_mode::NONE};
int nilsimsa_depth{20000}; int nilsimsa_depth{20000};
int nilsimsa_min_depth{1000};
int nilsimsa_limit{255}; int nilsimsa_limit{255};
}; };

View File

@ -70,6 +70,9 @@ class progress {
std::atomic<size_t> inodes_written{0}; std::atomic<size_t> inodes_written{0};
std::atomic<size_t> blocks_written{0}; std::atomic<size_t> blocks_written{0};
std::atomic<size_t> errors{0}; std::atomic<size_t> errors{0};
std::atomic<size_t> nilsimsa_depth{0};
std::atomic<size_t> blockify_queue{0};
std::atomic<size_t> compress_queue{0};
std::atomic<uint64_t> original_size{0}; std::atomic<uint64_t> original_size{0};
std::atomic<uint64_t> hardlink_size{0}; std::atomic<uint64_t> hardlink_size{0};
std::atomic<uint64_t> saved_by_deduplication{0}; std::atomic<uint64_t> saved_by_deduplication{0};

View File

@ -19,6 +19,7 @@
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>. * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/ */
#include <cstdlib>
#include <cstring> #include <cstring>
#include <locale> #include <locale>
#include <sstream> #include <sstream>
@ -44,6 +45,15 @@ namespace {
char const* const asc_bar[8] = {"=", "=", "=", "=", "=", "=", "=", "="}; char const* const asc_bar[8] = {"=", "=", "=", "=", "=", "=", "=", "="};
char const* const uni_bar[8] = {"", "", "", "", "", "", "", ""}; char const* const uni_bar[8] = {"", "", "", "", "", "", "", ""};
// Reports whether the DWARFS_DEBUG_PROGRESS environment variable is set to
// something that converts to a boolean `true`. An unset variable, or a value
// that cannot be converted to bool, yields `false`.
bool is_debug_progress() {
  auto* const env = ::getenv("DWARFS_DEBUG_PROGRESS");

  if (!env) {
    return false;
  }

  auto const parsed = folly::tryTo<bool>(env);

  // A failed conversion is treated the same as "not set".
  return parsed ? *parsed : false;
}
} // namespace } // namespace
console_writer::console_writer(std::ostream& os, progress_mode pg_mode, console_writer::console_writer(std::ostream& os, progress_mode pg_mode,
@ -56,7 +66,8 @@ console_writer::console_writer(std::ostream& os, progress_mode pg_mode,
, width_(width) , width_(width)
, mode_(mode) , mode_(mode)
, color_(stream_is_fancy_terminal(os)) , color_(stream_is_fancy_terminal(os))
, with_context_(with_context) { , with_context_(with_context)
, debug_progress_(is_debug_progress()) {
os_.imbue(std::locale(os_.getloc(), os_.imbue(std::locale(os_.getloc(),
new boost::posix_time::time_facet("%H:%M:%S.%f"))); new boost::posix_time::time_facet("%H:%M:%S.%f")));
if (threshold > level_type::INFO) { if (threshold > level_type::INFO) {
@ -176,7 +187,18 @@ void console_writer::update(const progress& p, bool last) {
<< p.files_found - p.duplicate_files << " inodes)" << newline << p.files_found - p.duplicate_files << " inodes)" << newline
<< "compressed filesystem: " << p.blocks_written << " blocks/" << "compressed filesystem: " << p.blocks_written << " blocks/"
<< size_with_unit(p.compressed_size) << " written" << newline; << size_with_unit(p.compressed_size) << " written";
if (debug_progress_) {
oss << " [" << p.nilsimsa_depth << "/" << p.blockify_queue << "/"
<< p.compress_queue << "]";
} else {
if (p.nilsimsa_depth > 0) {
oss << " [depth: " << p.nilsimsa_depth << "]";
}
}
oss << newline;
break; break;
case REWRITE: case REWRITE:

View File

@ -218,6 +218,7 @@ class filesystem_writer_ : public filesystem_writer::impl {
folly::ByteRange data) override; folly::ByteRange data) override;
void flush() override; void flush() override;
size_t size() const override { return os_.tellp(); } size_t size() const override { return os_.tellp(); }
int queue_fill() const override { return static_cast<int>(wg_.queue_size()); }
private: private:
void write_section(section_type type, std::vector<uint8_t>&& data, void write_section(section_type type, std::vector<uint8_t>&& data,

View File

@ -37,6 +37,7 @@
#include "dwarfs/nilsimsa.h" #include "dwarfs/nilsimsa.h"
#include "dwarfs/options.h" #include "dwarfs/options.h"
#include "dwarfs/os_access.h" #include "dwarfs/os_access.h"
#include "dwarfs/progress.h"
#include "dwarfs/script.h" #include "dwarfs/script.h"
#include "dwarfs/similarity.h" #include "dwarfs/similarity.h"
@ -149,8 +150,9 @@ class nilsimsa_cache_entry {
template <typename LoggerPolicy> template <typename LoggerPolicy>
class inode_manager_ : public inode_manager::impl { class inode_manager_ : public inode_manager::impl {
public: public:
inode_manager_(logger& lgr) inode_manager_(logger& lgr, progress& prog)
: log_(lgr) {} : LOG_PROXY_INIT(lgr)
, prog_(prog) {}
std::shared_ptr<inode> create_inode() override { std::shared_ptr<inode> create_inode() override {
auto ino = std::make_shared<inode_>(); auto ino = std::make_shared<inode_>();
@ -162,7 +164,7 @@ class inode_manager_ : public inode_manager::impl {
void order_inodes(std::shared_ptr<script> scr, void order_inodes(std::shared_ptr<script> scr,
file_order_options const& file_order, uint32_t first_inode, file_order_options const& file_order, uint32_t first_inode,
inode_manager::inode_cb const& fn) override; inode_manager::order_cb const& fn) override;
void void
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn) for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
@ -215,7 +217,7 @@ class inode_manager_ : public inode_manager::impl {
std::vector<uint32_t>& index); std::vector<uint32_t>& index);
void void
order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn, uint32_t inode_no, order_inodes_by_nilsimsa(inode_manager::order_cb const& fn, uint32_t inode_no,
file_order_options const& file_order); file_order_options const& file_order);
void number_inodes(size_t first_no) { void number_inodes(size_t first_no) {
@ -225,13 +227,14 @@ class inode_manager_ : public inode_manager::impl {
} }
std::vector<std::shared_ptr<inode>> inodes_; std::vector<std::shared_ptr<inode>> inodes_;
log_proxy<LoggerPolicy> log_; LOG_PROXY_DECL(LoggerPolicy);
progress& prog_;
}; };
template <typename LoggerPolicy> template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes( void inode_manager_<LoggerPolicy>::order_inodes(
std::shared_ptr<script> scr, file_order_options const& file_order, std::shared_ptr<script> scr, file_order_options const& file_order,
uint32_t first_inode, inode_manager::inode_cb const& fn) { uint32_t first_inode, inode_manager::order_cb const& fn) {
switch (file_order.mode) { switch (file_order.mode) {
case file_order_mode::NONE: case file_order_mode::NONE:
LOG_INFO << "keeping inode order"; LOG_INFO << "keeping inode order";
@ -276,7 +279,9 @@ void inode_manager_<LoggerPolicy>::order_inodes(
LOG_INFO << "assigning file inodes..."; LOG_INFO << "assigning file inodes...";
number_inodes(first_inode); number_inodes(first_inode);
for_each_inode(fn); for (const auto& ino : inodes_) {
fn(ino);
}
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -322,7 +327,7 @@ void inode_manager_<LoggerPolicy>::presort_index(
template <typename LoggerPolicy> template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa( void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inode_manager::inode_cb const& fn, uint32_t inode_no, inode_manager::order_cb const& fn, uint32_t inode_no,
file_order_options const& file_order) { file_order_options const& file_order) {
auto count = inodes_.size(); auto count = inodes_.size();
@ -340,7 +345,7 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inodes_.push_back(std::move(inodes[index.back()])); inodes_.push_back(std::move(inodes[index.back()]));
index.pop_back(); index.pop_back();
inodes_.back()->set_num(inode_no++); inodes_.back()->set_num(inode_no++);
fn(inodes_.back()); return fn(inodes_.back());
}; };
if (empty != index.end()) { if (empty != index.end()) {
@ -349,10 +354,15 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
} }
if (!index.empty()) { if (!index.empty()) {
const int depth = file_order.nilsimsa_depth; const int_fast32_t max_depth = file_order.nilsimsa_depth;
const int limit = file_order.nilsimsa_limit; const int_fast32_t min_depth =
std::min<int32_t>(file_order.nilsimsa_min_depth, max_depth);
const int_fast32_t limit = file_order.nilsimsa_limit;
int_fast32_t depth = max_depth;
int64_t processed = 0;
LOG_INFO << "nilsimsa: depth=" << depth << ", limit=" << limit; LOG_INFO << "nilsimsa: depth=" << depth << " (" << min_depth
<< "), limit=" << limit;
presort_index(inodes, index); presort_index(inodes, index);
@ -361,12 +371,12 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
while (!index.empty()) { while (!index.empty()) {
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data(); auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
int max_sim = 0; int_fast32_t max_sim = 0;
int max_sim_ix = 0; int_fast32_t max_sim_ix = 0;
int end = int(index.size()) > depth ? index.size() - depth : 0; int_fast32_t end = int(index.size()) > depth ? index.size() - depth : 0;
for (int i = index.size() - 1; i >= end; --i) { for (int_fast32_t i = index.size() - 1; i >= end; --i) {
auto sim = dwarfs::nilsimsa_similarity( auto sim = dwarfs::nilsimsa_similarity(
ref_hash, inodes[index[i]]->nilsimsa_similarity_hash().data()); ref_hash, inodes[index[i]]->nilsimsa_similarity_hash().data());
@ -385,7 +395,22 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1, std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
index.end()); index.end());
finalize_inode(); auto fill = finalize_inode();
if (++processed >= 4096 && processed % 32 == 0) {
constexpr int64_t smooth = 512;
auto target_depth = fill * max_depth / 2048;
depth = ((smooth - 1) * depth + target_depth) / smooth;
if (depth > max_depth) {
depth = max_depth;
} else if (depth < min_depth) {
depth = min_depth;
}
}
prog_.nilsimsa_depth = depth;
} }
} }
@ -394,8 +419,8 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
} }
} }
inode_manager::inode_manager(logger& lgr) inode_manager::inode_manager(logger& lgr, progress& prog)
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>( : impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
lgr)) {} lgr, prog)) {}
} // namespace dwarfs } // namespace dwarfs

View File

@ -485,7 +485,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
LOG_INFO << "finding duplicate files..."; LOG_INFO << "finding duplicate files...";
inode_manager im(lgr_); inode_manager im(lgr_, prog);
file_deduplication_visitor fdv; file_deduplication_visitor fdv;
root->accept(fdv); root->accept(fdv);
@ -540,7 +540,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
LOG_INFO << "building blocks..."; LOG_INFO << "building blocks...";
block_manager bm(lgr_, prog, cfg_, os_, fsw); block_manager bm(lgr_, prog, cfg_, os_, fsw);
worker_group blockify("blockify"); worker_group blockify("blockify", 1, 1 << 20);
im.order_inodes(script_, options_.file_order, first_file_inode, im.order_inodes(script_, options_.file_order, first_file_inode,
[&](std::shared_ptr<inode> const& ino) { [&](std::shared_ptr<inode> const& ino) {
@ -549,6 +549,12 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
bm.add_inode(ino); bm.add_inode(ino);
prog.inodes_written++; prog.inodes_written++;
}); });
auto queued_files = blockify.queue_size();
auto queued_blocks = fsw.queue_fill();
prog.blockify_queue = queued_files;
prog.compress_queue = queued_blocks;
return INT64_C(500) * queued_blocks +
static_cast<int64_t>(queued_files);
}); });
LOG_INFO << "waiting for segmenting/blockifying to finish..."; LOG_INFO << "waiting for segmenting/blockifying to finish...";

View File

@ -30,6 +30,7 @@
#include <iterator> #include <iterator>
#include <map> #include <map>
#include <memory> #include <memory>
#include <optional>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
@ -171,6 +172,40 @@ class script_options : public options_interface {
namespace { namespace {
// Parses one numeric sub-option of an `--order` specification.
//
// An empty `opt` is not an error: `value` keeps whatever it held before the
// call and 0 is returned, so callers can skip a field (e.g. "nilsimsa::50000").
//
// Parameters:
//   ordname  name of the order mode; only used in error messages
//   opt      raw text of the sub-option, may be empty
//   value    receives the parsed number; written only when parsing and all
//            range checks succeed
//   name     human-readable name of the sub-option for error messages
//   min      optional inclusive lower bound
//   max      optional inclusive upper bound
//
// Returns 0 on success and 1 after reporting an error on std::cerr.
int parse_order_option(std::string const& ordname, std::string const& opt,
                       int& value, std::string_view name,
                       std::optional<int> min = std::nullopt,
                       std::optional<int> max = std::nullopt) {
  if (opt.empty()) {
    return 0;
  }

  auto val = folly::tryTo<int>(opt);

  if (!val) {
    std::cerr << "error: " << name << " (" << opt
              << ") is not numeric for order '" << ordname << "'"
              << std::endl;
    return 1;
  }

  auto const tmp = *val;

  // Both bounds given: report the full permitted range in one message.
  if (min && max && (tmp < *min || tmp > *max)) {
    std::cerr << "error: " << name << " (" << opt
              << ") out of range for order '" << ordname << "' (" << *min
              << ".." << *max << ")" << std::endl;
    return 1;
  }

  // Only one bound given. These branches must fail as well; otherwise an
  // out-of-range value would be reported as an error yet still be stored.
  if (min && tmp < *min) {
    std::cerr << "error: " << name << " (" << opt
              << ") cannot be less than " << *min << " for order '"
              << ordname << "'" << std::endl;
    return 1;
  }

  if (max && tmp > *max) {
    std::cerr << "error: " << name << " (" << opt
              << ") cannot be greater than " << *max << " for order '"
              << ordname << "'" << std::endl;
    return 1;
  }

  value = tmp;
  return 0;
}
size_t get_term_width() { size_t get_term_width() {
struct ::winsize w; struct ::winsize w;
::ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); ::ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
@ -281,7 +316,7 @@ constexpr std::array<level_defaults, 10> levels{{
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", "16,14,12", "nilsimsa"}, /* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", "16,14,12", "nilsimsa"},
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, "16,14,12", "nilsimsa"}, /* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, "16,14,12", "nilsimsa"},
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa"}, /* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa:255:50000"}, /* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa::50000"},
// clang-format on // clang-format on
}}; }};
@ -548,40 +583,31 @@ int mkdwarfs(int argc, char** argv) {
return 1; return 1;
} }
if (order_opts.size() > 3) { if (order_opts.size() > 4) {
std::cerr << "error: too many options for inode order mode '" std::cerr << "error: too many options for inode order mode '"
<< order_opts[0] << "'" << std::endl; << order_opts[0] << "'" << std::endl;
return 1; return 1;
} }
if (auto val = folly::tryTo<int>(order_opts[1])) { auto ordname = order_opts[0];
options.file_order.nilsimsa_limit = *val;
if (options.file_order.nilsimsa_limit < 0 || if (parse_order_option(ordname, order_opts[1],
options.file_order.nilsimsa_limit > 255) { options.file_order.nilsimsa_limit, "limit", 0,
std::cerr << "error: limit (" << order_opts[1] 255)) {
<< ") out of range for order '" << order_opts[0]
<< "' (0..255)" << std::endl;
return 1;
}
} else {
std::cerr << "error: limit (" << order_opts[1]
<< ") is not numeric for order '" << order_opts[0] << "'"
<< std::endl;
return 1; return 1;
} }
if (order_opts.size() > 2) { if (order_opts.size() > 2) {
if (auto val = folly::tryTo<int>(order_opts[2])) { if (parse_order_option(ordname, order_opts[2],
options.file_order.nilsimsa_depth = *val; options.file_order.nilsimsa_depth, "depth", 0)) {
if (options.file_order.nilsimsa_depth < 0) { return 1;
std::cerr << "error: depth (" << order_opts[2] }
<< ") cannot be negative for order '" << order_opts[0] }
<< "'" << std::endl;
} if (order_opts.size() > 3) {
} else { if (parse_order_option(ordname, order_opts[3],
std::cerr << "error: depth (" << order_opts[2] options.file_order.nilsimsa_min_depth,
<< ") is not numeric for order '" << order_opts[0] << "'" "min depth", 0)) {
<< std::endl;
return 1; return 1;
} }
} }