mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-10 13:04:15 -04:00
Dynamically adapt nilsimsa search depth
This commit is contained in:
parent
6a328999be
commit
201b6ddb3b
@ -154,7 +154,7 @@ Most other options are concerned with compression tuning:
|
||||
the `mtime` field in order to save metadata space. If you want to save
|
||||
`atime` and `ctime` as well, use this option.
|
||||
|
||||
* `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`:
|
||||
* `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*[`:`*mindepth*]]]|`script`:
|
||||
The order in which inodes will be written to the file system. Choosing `none`,
|
||||
the inodes will be stored in the order in which they are discovered. With
|
||||
`path`, they will be sorted asciibetically by path name of the first file
|
||||
@ -168,9 +168,15 @@ Most other options are concerned with compression tuning:
|
||||
enough for adding. A *limit* of 255 means "essentially identical", whereas
|
||||
a *limit* of 0 means "not similar at all". The *depth* determines up to
|
||||
how many inodes can be checked while searching for a similar one.
|
||||
The default if you omit these values is a *limit* of 255 and a *depth*
|
||||
of 20000. Last but not least, if scripting support is built into `mkdwarfs`,
|
||||
you can choose `script` to let the script determine the order.
|
||||
To prevent nilsimsa ordering from becoming a bottleneck when ordering lots of
|
||||
small files, the *depth* is adjusted dynamically to keep the input queue
|
||||
to the segmentation/compression stages adequately filled. You can specify
|
||||
how much the *depth* can be adjusted by also specifying *mindepth*.
|
||||
The default if you omit these values is a *limit* of 255, a *depth*
|
||||
of 20000 and a *mindepth* of 1000. Note that if you want reproducible
|
||||
results, you need to set *depth* and *mindepth* to the same value.
|
||||
Last but not least, if scripting support is built into `mkdwarfs`, you can
|
||||
choose `script` to let the script determine the order.
|
||||
|
||||
* `--blockhash-window-sizes=`*value*[,*value*]...:
|
||||
Window sizes used for block hashing. These sizes, separated by commas,
|
||||
|
@ -61,5 +61,6 @@ class console_writer : public logger {
|
||||
display_mode const mode_;
|
||||
bool const color_;
|
||||
bool const with_context_;
|
||||
bool const debug_progress_;
|
||||
};
|
||||
} // namespace dwarfs
|
||||
|
@ -87,6 +87,8 @@ class filesystem_writer {
|
||||
|
||||
size_t size() const { return impl_->size(); }
|
||||
|
||||
int queue_fill() const { return impl_->queue_fill(); }
|
||||
|
||||
class impl {
|
||||
public:
|
||||
virtual ~impl() = default;
|
||||
@ -99,6 +101,7 @@ class filesystem_writer {
|
||||
folly::ByteRange data) = 0;
|
||||
virtual void flush() = 0;
|
||||
virtual size_t size() const = 0;
|
||||
virtual int queue_fill() const = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -29,6 +29,7 @@ namespace dwarfs {
|
||||
|
||||
class inode;
|
||||
class logger;
|
||||
class progress;
|
||||
class script;
|
||||
|
||||
struct file_order_options;
|
||||
@ -36,8 +37,9 @@ struct file_order_options;
|
||||
class inode_manager {
|
||||
public:
|
||||
using inode_cb = std::function<void(std::shared_ptr<inode> const&)>;
|
||||
using order_cb = std::function<int64_t(std::shared_ptr<inode> const&)>;
|
||||
|
||||
inode_manager(logger& lgr);
|
||||
inode_manager(logger& lgr, progress& prog);
|
||||
|
||||
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }
|
||||
|
||||
@ -45,7 +47,7 @@ class inode_manager {
|
||||
|
||||
void order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order, uint32_t first_inode,
|
||||
inode_cb const& fn) {
|
||||
order_cb const& fn) {
|
||||
impl_->order_inodes(std::move(scr), file_order, first_inode, fn);
|
||||
}
|
||||
|
||||
@ -59,7 +61,7 @@ class inode_manager {
|
||||
virtual size_t count() const = 0;
|
||||
virtual void order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order,
|
||||
uint32_t first_inode, inode_cb const& fn) = 0;
|
||||
uint32_t first_inode, order_cb const& fn) = 0;
|
||||
virtual void for_each_inode(
|
||||
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
|
||||
};
|
||||
|
@ -59,6 +59,7 @@ enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA };
|
||||
struct file_order_options {
|
||||
file_order_mode mode{file_order_mode::NONE};
|
||||
int nilsimsa_depth{20000};
|
||||
int nilsimsa_min_depth{1000};
|
||||
int nilsimsa_limit{255};
|
||||
};
|
||||
|
||||
|
@ -70,6 +70,9 @@ class progress {
|
||||
std::atomic<size_t> inodes_written{0};
|
||||
std::atomic<size_t> blocks_written{0};
|
||||
std::atomic<size_t> errors{0};
|
||||
std::atomic<size_t> nilsimsa_depth{0};
|
||||
std::atomic<size_t> blockify_queue{0};
|
||||
std::atomic<size_t> compress_queue{0};
|
||||
std::atomic<uint64_t> original_size{0};
|
||||
std::atomic<uint64_t> hardlink_size{0};
|
||||
std::atomic<uint64_t> saved_by_deduplication{0};
|
||||
|
@ -19,6 +19,7 @@
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
@ -44,6 +45,15 @@ namespace {
|
||||
char const* const asc_bar[8] = {"=", "=", "=", "=", "=", "=", "=", "="};
|
||||
char const* const uni_bar[8] = {"▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"};
|
||||
|
||||
bool is_debug_progress() {
|
||||
if (auto var = ::getenv("DWARFS_DEBUG_PROGRESS")) {
|
||||
if (auto val = folly::tryTo<bool>(var)) {
|
||||
return *val;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
console_writer::console_writer(std::ostream& os, progress_mode pg_mode,
|
||||
@ -56,7 +66,8 @@ console_writer::console_writer(std::ostream& os, progress_mode pg_mode,
|
||||
, width_(width)
|
||||
, mode_(mode)
|
||||
, color_(stream_is_fancy_terminal(os))
|
||||
, with_context_(with_context) {
|
||||
, with_context_(with_context)
|
||||
, debug_progress_(is_debug_progress()) {
|
||||
os_.imbue(std::locale(os_.getloc(),
|
||||
new boost::posix_time::time_facet("%H:%M:%S.%f")));
|
||||
if (threshold > level_type::INFO) {
|
||||
@ -176,7 +187,18 @@ void console_writer::update(const progress& p, bool last) {
|
||||
<< p.files_found - p.duplicate_files << " inodes)" << newline
|
||||
|
||||
<< "compressed filesystem: " << p.blocks_written << " blocks/"
|
||||
<< size_with_unit(p.compressed_size) << " written" << newline;
|
||||
<< size_with_unit(p.compressed_size) << " written";
|
||||
|
||||
if (debug_progress_) {
|
||||
oss << " [" << p.nilsimsa_depth << "/" << p.blockify_queue << "/"
|
||||
<< p.compress_queue << "]";
|
||||
} else {
|
||||
if (p.nilsimsa_depth > 0) {
|
||||
oss << " [depth: " << p.nilsimsa_depth << "]";
|
||||
}
|
||||
}
|
||||
|
||||
oss << newline;
|
||||
break;
|
||||
|
||||
case REWRITE:
|
||||
|
@ -218,6 +218,7 @@ class filesystem_writer_ : public filesystem_writer::impl {
|
||||
folly::ByteRange data) override;
|
||||
void flush() override;
|
||||
size_t size() const override { return os_.tellp(); }
|
||||
int queue_fill() const override { return static_cast<int>(wg_.queue_size()); }
|
||||
|
||||
private:
|
||||
void write_section(section_type type, std::vector<uint8_t>&& data,
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include "dwarfs/nilsimsa.h"
|
||||
#include "dwarfs/options.h"
|
||||
#include "dwarfs/os_access.h"
|
||||
#include "dwarfs/progress.h"
|
||||
#include "dwarfs/script.h"
|
||||
#include "dwarfs/similarity.h"
|
||||
|
||||
@ -149,8 +150,9 @@ class nilsimsa_cache_entry {
|
||||
template <typename LoggerPolicy>
|
||||
class inode_manager_ : public inode_manager::impl {
|
||||
public:
|
||||
inode_manager_(logger& lgr)
|
||||
: log_(lgr) {}
|
||||
inode_manager_(logger& lgr, progress& prog)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, prog_(prog) {}
|
||||
|
||||
std::shared_ptr<inode> create_inode() override {
|
||||
auto ino = std::make_shared<inode_>();
|
||||
@ -162,7 +164,7 @@ class inode_manager_ : public inode_manager::impl {
|
||||
|
||||
void order_inodes(std::shared_ptr<script> scr,
|
||||
file_order_options const& file_order, uint32_t first_inode,
|
||||
inode_manager::inode_cb const& fn) override;
|
||||
inode_manager::order_cb const& fn) override;
|
||||
|
||||
void
|
||||
for_each_inode(std::function<void(std::shared_ptr<inode> const&)> const& fn)
|
||||
@ -215,7 +217,7 @@ class inode_manager_ : public inode_manager::impl {
|
||||
std::vector<uint32_t>& index);
|
||||
|
||||
void
|
||||
order_inodes_by_nilsimsa(inode_manager::inode_cb const& fn, uint32_t inode_no,
|
||||
order_inodes_by_nilsimsa(inode_manager::order_cb const& fn, uint32_t inode_no,
|
||||
file_order_options const& file_order);
|
||||
|
||||
void number_inodes(size_t first_no) {
|
||||
@ -225,13 +227,14 @@ class inode_manager_ : public inode_manager::impl {
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<inode>> inodes_;
|
||||
log_proxy<LoggerPolicy> log_;
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
progress& prog_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
std::shared_ptr<script> scr, file_order_options const& file_order,
|
||||
uint32_t first_inode, inode_manager::inode_cb const& fn) {
|
||||
uint32_t first_inode, inode_manager::order_cb const& fn) {
|
||||
switch (file_order.mode) {
|
||||
case file_order_mode::NONE:
|
||||
LOG_INFO << "keeping inode order";
|
||||
@ -276,7 +279,9 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
|
||||
LOG_INFO << "assigning file inodes...";
|
||||
number_inodes(first_inode);
|
||||
for_each_inode(fn);
|
||||
for (const auto& ino : inodes_) {
|
||||
fn(ino);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
@ -322,7 +327,7 @@ void inode_manager_<LoggerPolicy>::presort_index(
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
inode_manager::inode_cb const& fn, uint32_t inode_no,
|
||||
inode_manager::order_cb const& fn, uint32_t inode_no,
|
||||
file_order_options const& file_order) {
|
||||
auto count = inodes_.size();
|
||||
|
||||
@ -340,7 +345,7 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
inodes_.push_back(std::move(inodes[index.back()]));
|
||||
index.pop_back();
|
||||
inodes_.back()->set_num(inode_no++);
|
||||
fn(inodes_.back());
|
||||
return fn(inodes_.back());
|
||||
};
|
||||
|
||||
if (empty != index.end()) {
|
||||
@ -349,10 +354,15 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
}
|
||||
|
||||
if (!index.empty()) {
|
||||
const int depth = file_order.nilsimsa_depth;
|
||||
const int limit = file_order.nilsimsa_limit;
|
||||
const int_fast32_t max_depth = file_order.nilsimsa_depth;
|
||||
const int_fast32_t min_depth =
|
||||
std::min<int32_t>(file_order.nilsimsa_min_depth, max_depth);
|
||||
const int_fast32_t limit = file_order.nilsimsa_limit;
|
||||
int_fast32_t depth = max_depth;
|
||||
int64_t processed = 0;
|
||||
|
||||
LOG_INFO << "nilsimsa: depth=" << depth << ", limit=" << limit;
|
||||
LOG_INFO << "nilsimsa: depth=" << depth << " (" << min_depth
|
||||
<< "), limit=" << limit;
|
||||
|
||||
presort_index(inodes, index);
|
||||
|
||||
@ -361,12 +371,12 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
while (!index.empty()) {
|
||||
auto* ref_hash = inodes_.back()->nilsimsa_similarity_hash().data();
|
||||
|
||||
int max_sim = 0;
|
||||
int max_sim_ix = 0;
|
||||
int_fast32_t max_sim = 0;
|
||||
int_fast32_t max_sim_ix = 0;
|
||||
|
||||
int end = int(index.size()) > depth ? index.size() - depth : 0;
|
||||
int_fast32_t end = int(index.size()) > depth ? index.size() - depth : 0;
|
||||
|
||||
for (int i = index.size() - 1; i >= end; --i) {
|
||||
for (int_fast32_t i = index.size() - 1; i >= end; --i) {
|
||||
auto sim = dwarfs::nilsimsa_similarity(
|
||||
ref_hash, inodes[index[i]]->nilsimsa_similarity_hash().data());
|
||||
|
||||
@ -385,7 +395,22 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
|
||||
index.end());
|
||||
|
||||
finalize_inode();
|
||||
auto fill = finalize_inode();
|
||||
|
||||
if (++processed >= 4096 && processed % 32 == 0) {
|
||||
constexpr int64_t smooth = 512;
|
||||
auto target_depth = fill * max_depth / 2048;
|
||||
|
||||
depth = ((smooth - 1) * depth + target_depth) / smooth;
|
||||
|
||||
if (depth > max_depth) {
|
||||
depth = max_depth;
|
||||
} else if (depth < min_depth) {
|
||||
depth = min_depth;
|
||||
}
|
||||
}
|
||||
|
||||
prog_.nilsimsa_depth = depth;
|
||||
}
|
||||
}
|
||||
|
||||
@ -394,8 +419,8 @@ void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
}
|
||||
}
|
||||
|
||||
inode_manager::inode_manager(logger& lgr)
|
||||
inode_manager::inode_manager(logger& lgr, progress& prog)
|
||||
: impl_(make_unique_logging_object<impl, inode_manager_, logger_policies>(
|
||||
lgr)) {}
|
||||
lgr, prog)) {}
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -485,7 +485,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
|
||||
LOG_INFO << "finding duplicate files...";
|
||||
|
||||
inode_manager im(lgr_);
|
||||
inode_manager im(lgr_, prog);
|
||||
|
||||
file_deduplication_visitor fdv;
|
||||
root->accept(fdv);
|
||||
@ -540,7 +540,7 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
LOG_INFO << "building blocks...";
|
||||
block_manager bm(lgr_, prog, cfg_, os_, fsw);
|
||||
|
||||
worker_group blockify("blockify");
|
||||
worker_group blockify("blockify", 1, 1 << 20);
|
||||
|
||||
im.order_inodes(script_, options_.file_order, first_file_inode,
|
||||
[&](std::shared_ptr<inode> const& ino) {
|
||||
@ -549,6 +549,12 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
bm.add_inode(ino);
|
||||
prog.inodes_written++;
|
||||
});
|
||||
auto queued_files = blockify.queue_size();
|
||||
auto queued_blocks = fsw.queue_fill();
|
||||
prog.blockify_queue = queued_files;
|
||||
prog.compress_queue = queued_blocks;
|
||||
return INT64_C(500) * queued_blocks +
|
||||
static_cast<int64_t>(queued_files);
|
||||
});
|
||||
|
||||
LOG_INFO << "waiting for segmenting/blockifying to finish...";
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
@ -171,6 +172,40 @@ class script_options : public options_interface {
|
||||
|
||||
namespace {
|
||||
|
||||
int parse_order_option(std::string const& ordname, std::string const& opt,
|
||||
int& value, std::string_view name,
|
||||
std::optional<int> min = std::nullopt,
|
||||
std::optional<int> max = std::nullopt) {
|
||||
if (!opt.empty()) {
|
||||
if (auto val = folly::tryTo<int>(opt)) {
|
||||
auto tmp = *val;
|
||||
if (min && max && (tmp < *min || tmp > *max)) {
|
||||
std::cerr << "error: " << name << " (" << opt
|
||||
<< ") out of range for order '" << ordname << "' (" << *min
|
||||
<< ".." << *max << ")" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
if (min && tmp < *min) {
|
||||
std::cerr << "error: " << name << " (" << opt
|
||||
<< ") cannot be less than " << *min << " for order '"
|
||||
<< ordname << "'" << std::endl;
|
||||
}
|
||||
if (max && tmp > *max) {
|
||||
std::cerr << "error: " << name << " (" << opt
|
||||
<< ") cannot be greater than " << *max << " for order '"
|
||||
<< ordname << "'" << std::endl;
|
||||
}
|
||||
value = tmp;
|
||||
} else {
|
||||
std::cerr << "error: " << name << " (" << opt
|
||||
<< ") is not numeric for order '" << ordname << "'"
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t get_term_width() {
|
||||
struct ::winsize w;
|
||||
::ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
|
||||
@ -281,7 +316,7 @@ constexpr std::array<level_defaults, 10> levels{{
|
||||
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", "16,14,12", "nilsimsa"},
|
||||
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, "16,14,12", "nilsimsa"},
|
||||
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa"},
|
||||
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa:255:50000"},
|
||||
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, "16,14,12", "nilsimsa::50000"},
|
||||
// clang-format on
|
||||
}};
|
||||
|
||||
@ -548,40 +583,31 @@ int mkdwarfs(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (order_opts.size() > 3) {
|
||||
if (order_opts.size() > 4) {
|
||||
std::cerr << "error: too many options for inode order mode '"
|
||||
<< order_opts[0] << "'" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (auto val = folly::tryTo<int>(order_opts[1])) {
|
||||
options.file_order.nilsimsa_limit = *val;
|
||||
if (options.file_order.nilsimsa_limit < 0 ||
|
||||
options.file_order.nilsimsa_limit > 255) {
|
||||
std::cerr << "error: limit (" << order_opts[1]
|
||||
<< ") out of range for order '" << order_opts[0]
|
||||
<< "' (0..255)" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
std::cerr << "error: limit (" << order_opts[1]
|
||||
<< ") is not numeric for order '" << order_opts[0] << "'"
|
||||
<< std::endl;
|
||||
auto ordname = order_opts[0];
|
||||
|
||||
if (parse_order_option(ordname, order_opts[1],
|
||||
options.file_order.nilsimsa_limit, "limit", 0,
|
||||
255)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (order_opts.size() > 2) {
|
||||
if (auto val = folly::tryTo<int>(order_opts[2])) {
|
||||
options.file_order.nilsimsa_depth = *val;
|
||||
if (options.file_order.nilsimsa_depth < 0) {
|
||||
std::cerr << "error: depth (" << order_opts[2]
|
||||
<< ") cannot be negative for order '" << order_opts[0]
|
||||
<< "'" << std::endl;
|
||||
if (parse_order_option(ordname, order_opts[2],
|
||||
options.file_order.nilsimsa_depth, "depth", 0)) {
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
std::cerr << "error: depth (" << order_opts[2]
|
||||
<< ") is not numeric for order '" << order_opts[0] << "'"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
if (order_opts.size() > 3) {
|
||||
if (parse_order_option(ordname, order_opts[3],
|
||||
options.file_order.nilsimsa_min_depth,
|
||||
"min depth", 0)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user