mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-10 13:04:15 -04:00
Improve segmenter logging
This commit is contained in:
parent
5a9c7f823e
commit
a9636a3f0b
@ -40,6 +40,7 @@ struct compression_constraints;
|
|||||||
class segmenter {
|
class segmenter {
|
||||||
public:
|
public:
|
||||||
struct config {
|
struct config {
|
||||||
|
std::string context;
|
||||||
unsigned blockhash_window_size{12};
|
unsigned blockhash_window_size{12};
|
||||||
unsigned window_increment_shift{1};
|
unsigned window_increment_shift{1};
|
||||||
size_t max_active_blocks{1};
|
size_t max_active_blocks{1};
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
|
class categorizer_manager;
|
||||||
class logger;
|
class logger;
|
||||||
class progress;
|
class progress;
|
||||||
|
|
||||||
@ -43,6 +44,10 @@ class segmenter_factory {
|
|||||||
unsigned block_size_bits{22};
|
unsigned block_size_bits{22};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
segmenter_factory(logger& lgr, progress& prog,
|
||||||
|
std::shared_ptr<categorizer_manager> catmgr,
|
||||||
|
config const& cfg);
|
||||||
|
|
||||||
segmenter_factory(logger& lgr, progress& prog, config const& cfg);
|
segmenter_factory(logger& lgr, progress& prog, config const& cfg);
|
||||||
|
|
||||||
segmenter create(fragment_category cat, compression_constraints const& cc,
|
segmenter create(fragment_category cat, compression_constraints const& cc,
|
||||||
|
@ -619,9 +619,10 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
|
|||||||
, global_filter_{bloom_filter_size(cfg)}
|
, global_filter_{bloom_filter_size(cfg)}
|
||||||
, match_counts_{1, 0, 128} {
|
, match_counts_{1, 0, 128} {
|
||||||
if constexpr (is_segmentation_enabled()) {
|
if constexpr (is_segmentation_enabled()) {
|
||||||
LOG_INFO << "using a " << size_with_unit(window_size_) << " window at "
|
LOG_VERBOSE << cfg_.context << "using a " << size_with_unit(window_size_)
|
||||||
<< size_with_unit(window_step_) << " steps for segment analysis";
|
<< " window at " << size_with_unit(window_step_)
|
||||||
LOG_INFO << "bloom filter size: "
|
<< " steps for segment analysis";
|
||||||
|
LOG_VERBOSE << cfg_.context << "bloom filter size: "
|
||||||
<< size_with_unit(global_filter_.size() / 8);
|
<< size_with_unit(global_filter_.size() / 8);
|
||||||
|
|
||||||
repeating_sequence_hash_values_.reserve(256);
|
repeating_sequence_hash_values_.reserve(256);
|
||||||
@ -851,7 +852,7 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::add_chunkable(
|
|||||||
chunkable& chkable) {
|
chunkable& chkable) {
|
||||||
if (auto size_in_frames = bytes_to_frames(chkable.size());
|
if (auto size_in_frames = bytes_to_frames(chkable.size());
|
||||||
size_in_frames > 0) {
|
size_in_frames > 0) {
|
||||||
LOG_TRACE << "adding " << chkable.description();
|
LOG_TRACE << cfg_.context << "adding " << chkable.description();
|
||||||
|
|
||||||
if (!is_segmentation_enabled() or size_in_frames < window_size_) {
|
if (!is_segmentation_enabled() or size_in_frames < window_size_) {
|
||||||
// no point dealing with hashing, just write it out
|
// no point dealing with hashing, just write it out
|
||||||
@ -872,7 +873,7 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::finish() {
|
|||||||
auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount();
|
auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount();
|
||||||
|
|
||||||
if (stats_.bloom_lookups > 0) {
|
if (stats_.bloom_lookups > 0) {
|
||||||
LOG_INFO << "bloom filter reject rate: "
|
LOG_VERBOSE << cfg_.context << "bloom filter reject rate: "
|
||||||
<< fmt::format("{:.3f}%", 100.0 - 100.0 * stats_.bloom_hits /
|
<< fmt::format("{:.3f}%", 100.0 - 100.0 * stats_.bloom_hits /
|
||||||
stats_.bloom_lookups)
|
stats_.bloom_lookups)
|
||||||
<< " (TPR="
|
<< " (TPR="
|
||||||
@ -881,20 +882,21 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::finish() {
|
|||||||
<< ", lookups=" << stats_.bloom_lookups << ")";
|
<< ", lookups=" << stats_.bloom_lookups << ")";
|
||||||
}
|
}
|
||||||
if (stats_.total_matches > 0) {
|
if (stats_.total_matches > 0) {
|
||||||
LOG_INFO << "segmentation matches: good=" << stats_.good_matches
|
LOG_VERBOSE << cfg_.context
|
||||||
|
<< "segmentation matches: good=" << stats_.good_matches
|
||||||
<< ", bad=" << stats_.bad_matches << ", collisions="
|
<< ", bad=" << stats_.bad_matches << ", collisions="
|
||||||
<< (stats_.total_matches -
|
<< (stats_.total_matches -
|
||||||
(stats_.bad_matches + stats_.good_matches))
|
(stats_.bad_matches + stats_.good_matches))
|
||||||
<< ", total=" << stats_.total_matches;
|
<< ", total=" << stats_.total_matches;
|
||||||
}
|
}
|
||||||
if (stats_.total_hashes > 0) {
|
if (stats_.total_hashes > 0) {
|
||||||
LOG_INFO << "segmentation collisions: L1="
|
LOG_VERBOSE << cfg_.context << "segmentation collisions: L1="
|
||||||
<< fmt::format("{:.3f}%",
|
<< fmt::format("{:.3f}%",
|
||||||
100.0 * (l1_collisions + stats_.l2_collisions) /
|
100.0 * (l1_collisions + stats_.l2_collisions) /
|
||||||
stats_.total_hashes)
|
stats_.total_hashes)
|
||||||
<< ", L2="
|
<< ", L2="
|
||||||
<< fmt::format("{:.3f}%",
|
<< fmt::format("{:.3f}%", 100.0 * stats_.l2_collisions /
|
||||||
100.0 * stats_.l2_collisions / stats_.total_hashes)
|
stats_.total_hashes)
|
||||||
<< " [" << stats_.total_hashes << " hashes]";
|
<< " [" << stats_.total_hashes << " hashes]";
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -902,20 +904,22 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::finish() {
|
|||||||
auto pct = [&](double p) {
|
auto pct = [&](double p) {
|
||||||
return stats_.l2_collision_vec_size.getPercentileEstimate(p);
|
return stats_.l2_collision_vec_size.getPercentileEstimate(p);
|
||||||
};
|
};
|
||||||
LOG_DEBUG << "collision vector size p50: " << pct(0.5)
|
LOG_DEBUG << cfg_.context << "collision vector size p50: " << pct(0.5)
|
||||||
<< ", p75: " << pct(0.75) << ", p90: " << pct(0.9)
|
<< ", p75: " << pct(0.75) << ", p90: " << pct(0.9)
|
||||||
<< ", p95: " << pct(0.95) << ", p99: " << pct(0.99);
|
<< ", p95: " << pct(0.95) << ", p99: " << pct(0.99);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto pct = [&](double p) { return match_counts_.getPercentileEstimate(p); };
|
auto pct = [&](double p) { return match_counts_.getPercentileEstimate(p); };
|
||||||
|
|
||||||
LOG_DEBUG << "match counts p50: " << pct(0.5) << ", p75: " << pct(0.75)
|
LOG_DEBUG << cfg_.context << "match counts p50: " << pct(0.5)
|
||||||
<< ", p90: " << pct(0.9) << ", p95: " << pct(0.95)
|
<< ", p75: " << pct(0.75) << ", p90: " << pct(0.9)
|
||||||
<< ", p99: " << pct(0.99);
|
<< ", p95: " << pct(0.95) << ", p99: " << pct(0.99);
|
||||||
|
|
||||||
for (auto [k, v] : repeating_collisions_) {
|
for (auto [k, v] : repeating_collisions_) {
|
||||||
LOG_INFO << fmt::format("avoided {} collisions in 0x{:02x}-byte sequences",
|
LOG_VERBOSE << cfg_.context
|
||||||
v, k);
|
<< fmt::format(
|
||||||
|
"avoided {} collisions in 0x{:02x}-byte sequences", v,
|
||||||
|
k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -952,8 +956,9 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::append_to_block(
|
|||||||
auto const size_in_bytes = frames_to_bytes(size_in_frames);
|
auto const size_in_bytes = frames_to_bytes(size_in_frames);
|
||||||
auto& block = blocks_.back();
|
auto& block = blocks_.back();
|
||||||
|
|
||||||
LOG_TRACE << "appending " << size_in_bytes << " bytes to block "
|
LOG_TRACE << cfg_.context << "appending " << size_in_bytes
|
||||||
<< block.num() << " @ " << frames_to_bytes(block.size_in_frames())
|
<< " bytes to block " << block.num() << " @ "
|
||||||
|
<< frames_to_bytes(block.size_in_frames())
|
||||||
<< " from chunkable offset " << offset_in_bytes;
|
<< " from chunkable offset " << offset_in_bytes;
|
||||||
|
|
||||||
block.append_bytes(chkable.span().subspan(offset_in_bytes, size_in_bytes),
|
block.append_bytes(chkable.span().subspan(offset_in_bytes, size_in_bytes),
|
||||||
@ -1064,7 +1069,7 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::segment_and_add_data(
|
|||||||
++stats_.bloom_true_positives;
|
++stats_.bloom_true_positives;
|
||||||
match_counts_.addValue(matches.size());
|
match_counts_.addValue(matches.size());
|
||||||
|
|
||||||
LOG_TRACE << "[" << blocks_.back().num() << " @ "
|
LOG_TRACE << cfg_.context << "[" << blocks_.back().num() << " @ "
|
||||||
<< frames_to_bytes(blocks_.back().size_in_frames())
|
<< frames_to_bytes(blocks_.back().size_in_frames())
|
||||||
<< ", chunkable @ " << frames_to_bytes(offset_in_frames)
|
<< ", chunkable @ " << frames_to_bytes(offset_in_frames)
|
||||||
<< "] found " << matches.size()
|
<< "] found " << matches.size()
|
||||||
@ -1072,13 +1077,15 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::segment_and_add_data(
|
|||||||
<< ", window size=" << window_size_ << ")";
|
<< ", window size=" << window_size_ << ")";
|
||||||
|
|
||||||
for (auto& m : matches) {
|
for (auto& m : matches) {
|
||||||
LOG_TRACE << " block " << m.block_num() << " @ " << m.offset();
|
LOG_TRACE << cfg_.context << " block " << m.block_num() << " @ "
|
||||||
|
<< m.offset();
|
||||||
// m.verify_and_extend(p + offset_in_frames - window_size_,
|
// m.verify_and_extend(p + offset_in_frames - window_size_,
|
||||||
// window_size_,
|
// window_size_,
|
||||||
// p + frames_written, p + size_in_frames);
|
// p + frames_written, p + size_in_frames);
|
||||||
m.verify_and_extend(data, offset_in_frames - window_size_,
|
m.verify_and_extend(data, offset_in_frames - window_size_,
|
||||||
window_size_, frames_written, size_in_frames);
|
window_size_, frames_written, size_in_frames);
|
||||||
LOG_TRACE << " -> " << m.offset() << " -> " << m.size();
|
LOG_TRACE << cfg_.context << " -> " << m.offset() << " -> "
|
||||||
|
<< m.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
stats_.total_matches += matches.size();
|
stats_.total_matches += matches.size();
|
||||||
@ -1091,8 +1098,8 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::segment_and_add_data(
|
|||||||
|
|
||||||
if (match_len > 0) {
|
if (match_len > 0) {
|
||||||
++stats_.good_matches;
|
++stats_.good_matches;
|
||||||
LOG_TRACE << "successful match of length " << match_len << " @ "
|
LOG_TRACE << cfg_.context << "successful match of length "
|
||||||
<< best->offset();
|
<< match_len << " @ " << best->offset();
|
||||||
|
|
||||||
auto block_num = best->block_num();
|
auto block_num = best->block_num();
|
||||||
auto match_off = best->offset();
|
auto match_off = best->offset();
|
||||||
|
@ -20,15 +20,18 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "dwarfs/segmenter_factory.h"
|
#include "dwarfs/segmenter_factory.h"
|
||||||
|
#include "dwarfs/categorizer.h"
|
||||||
|
|
||||||
namespace dwarfs {
|
namespace dwarfs {
|
||||||
|
|
||||||
class segmenter_factory_ final : public segmenter_factory::impl {
|
class segmenter_factory_ final : public segmenter_factory::impl {
|
||||||
public:
|
public:
|
||||||
segmenter_factory_(logger& lgr, progress& prog,
|
segmenter_factory_(logger& lgr, progress& prog,
|
||||||
|
std::shared_ptr<categorizer_manager> catmgr,
|
||||||
const segmenter_factory::config& cfg)
|
const segmenter_factory::config& cfg)
|
||||||
: lgr_{lgr}
|
: lgr_{lgr}
|
||||||
, prog_{prog}
|
, prog_{prog}
|
||||||
|
, catmgr_{catmgr}
|
||||||
, cfg_{cfg} {}
|
, cfg_{cfg} {}
|
||||||
|
|
||||||
segmenter create(fragment_category cat, compression_constraints const& cc,
|
segmenter create(fragment_category cat, compression_constraints const& cc,
|
||||||
@ -36,6 +39,10 @@ class segmenter_factory_ final : public segmenter_factory::impl {
|
|||||||
segmenter::block_ready_cb block_ready) const override {
|
segmenter::block_ready_cb block_ready) const override {
|
||||||
segmenter::config cfg;
|
segmenter::config cfg;
|
||||||
|
|
||||||
|
if (catmgr_) {
|
||||||
|
cfg.context = category_prefix(catmgr_, cat);
|
||||||
|
}
|
||||||
|
|
||||||
cfg.blockhash_window_size = cfg_.blockhash_window_size.get(cat);
|
cfg.blockhash_window_size = cfg_.blockhash_window_size.get(cat);
|
||||||
cfg.window_increment_shift = cfg_.window_increment_shift.get(cat);
|
cfg.window_increment_shift = cfg_.window_increment_shift.get(cat);
|
||||||
cfg.max_active_blocks = cfg_.max_active_blocks.get(cat);
|
cfg.max_active_blocks = cfg_.max_active_blocks.get(cat);
|
||||||
@ -53,11 +60,18 @@ class segmenter_factory_ final : public segmenter_factory::impl {
|
|||||||
private:
|
private:
|
||||||
logger& lgr_;
|
logger& lgr_;
|
||||||
progress& prog_;
|
progress& prog_;
|
||||||
|
std::shared_ptr<categorizer_manager> catmgr_;
|
||||||
segmenter_factory::config cfg_;
|
segmenter_factory::config cfg_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
segmenter_factory::segmenter_factory(
|
||||||
|
logger& lgr, progress& prog, std::shared_ptr<categorizer_manager> catmgr,
|
||||||
|
config const& cfg)
|
||||||
|
: impl_(std::make_unique<segmenter_factory_>(lgr, prog, std::move(catmgr),
|
||||||
|
cfg)) {}
|
||||||
|
|
||||||
segmenter_factory::segmenter_factory(logger& lgr, progress& prog,
|
segmenter_factory::segmenter_factory(logger& lgr, progress& prog,
|
||||||
config const& cfg)
|
config const& cfg)
|
||||||
: impl_(std::make_unique<segmenter_factory_>(lgr, prog, cfg)) {}
|
: segmenter_factory(lgr, prog, nullptr, cfg) {}
|
||||||
|
|
||||||
} // namespace dwarfs
|
} // namespace dwarfs
|
||||||
|
@ -1053,9 +1053,10 @@ int mkdwarfs_main(int argc, sys_char** argv) {
|
|||||||
fsw, rw_opts);
|
fsw, rw_opts);
|
||||||
wg_compress.wait();
|
wg_compress.wait();
|
||||||
} else {
|
} else {
|
||||||
auto sf = std::make_shared<segmenter_factory>(lgr, prog, sf_config);
|
auto sf = std::make_shared<segmenter_factory>(
|
||||||
|
lgr, prog, options.inode.categorizer_mgr, sf_config);
|
||||||
|
|
||||||
scanner s(lgr, wg_scanner, sf, entry_factory::create(),
|
scanner s(lgr, wg_scanner, std::move(sf), entry_factory::create(),
|
||||||
std::make_shared<os_access_generic>(), std::move(script),
|
std::make_shared<os_access_generic>(), std::move(script),
|
||||||
options);
|
options);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user