diff --git a/include/dwarfs/segmenter.h b/include/dwarfs/segmenter.h index 04bbc228..7bd29b7a 100644 --- a/include/dwarfs/segmenter.h +++ b/include/dwarfs/segmenter.h @@ -40,6 +40,7 @@ struct compression_constraints; class segmenter { public: struct config { + std::string context; unsigned blockhash_window_size{12}; unsigned window_increment_shift{1}; size_t max_active_blocks{1}; diff --git a/include/dwarfs/segmenter_factory.h b/include/dwarfs/segmenter_factory.h index 4f1069bf..e02a3acc 100644 --- a/include/dwarfs/segmenter_factory.h +++ b/include/dwarfs/segmenter_factory.h @@ -28,6 +28,7 @@ namespace dwarfs { +class categorizer_manager; class logger; class progress; @@ -43,6 +44,10 @@ class segmenter_factory { unsigned block_size_bits{22}; }; + segmenter_factory(logger& lgr, progress& prog, + std::shared_ptr catmgr, + config const& cfg); + segmenter_factory(logger& lgr, progress& prog, config const& cfg); segmenter create(fragment_category cat, compression_constraints const& cc, diff --git a/src/dwarfs/segmenter.cpp b/src/dwarfs/segmenter.cpp index cf293820..efc62659 100644 --- a/src/dwarfs/segmenter.cpp +++ b/src/dwarfs/segmenter.cpp @@ -619,10 +619,11 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy { , global_filter_{bloom_filter_size(cfg)} , match_counts_{1, 0, 128} { if constexpr (is_segmentation_enabled()) { - LOG_INFO << "using a " << size_with_unit(window_size_) << " window at " - << size_with_unit(window_step_) << " steps for segment analysis"; - LOG_INFO << "bloom filter size: " - << size_with_unit(global_filter_.size() / 8); + LOG_VERBOSE << cfg_.context << "using a " << size_with_unit(window_size_) + << " window at " << size_with_unit(window_step_) + << " steps for segment analysis"; + LOG_VERBOSE << cfg_.context << "bloom filter size: " + << size_with_unit(global_filter_.size() / 8); repeating_sequence_hash_values_.reserve(256); @@ -851,7 +852,7 @@ void segmenter_::add_chunkable( chunkable& chkable) { if (auto size_in_frames = bytes_to_frames(chkable.size()); size_in_frames > 0) { - LOG_TRACE << "adding " << chkable.description(); + LOG_TRACE << cfg_.context << "adding " << chkable.description(); if (!is_segmentation_enabled() or size_in_frames < window_size_) { // no point dealing with hashing, just write it out @@ -872,50 +873,53 @@ void segmenter_::finish() { auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount(); if (stats_.bloom_lookups > 0) { - LOG_INFO << "bloom filter reject rate: " - << fmt::format("{:.3f}%", 100.0 - 100.0 * stats_.bloom_hits / - stats_.bloom_lookups) - << " (TPR=" - << fmt::format("{:.3f}%", 100.0 * stats_.bloom_true_positives / - stats_.bloom_hits) - << ", lookups=" << stats_.bloom_lookups << ")"; + LOG_VERBOSE << cfg_.context << "bloom filter reject rate: " + << fmt::format("{:.3f}%", 100.0 - 100.0 * stats_.bloom_hits / + stats_.bloom_lookups) + << " (TPR=" + << fmt::format("{:.3f}%", 100.0 * stats_.bloom_true_positives / + stats_.bloom_hits) + << ", lookups=" << stats_.bloom_lookups << ")"; } if (stats_.total_matches > 0) { - LOG_INFO << "segmentation matches: good=" << stats_.good_matches - << ", bad=" << stats_.bad_matches << ", collisions=" - << (stats_.total_matches - - (stats_.bad_matches + stats_.good_matches)) - << ", total=" << stats_.total_matches; + LOG_VERBOSE << cfg_.context + << "segmentation matches: good=" << stats_.good_matches + << ", bad=" << stats_.bad_matches << ", collisions=" + << (stats_.total_matches - + (stats_.bad_matches + stats_.good_matches)) + << ", total=" << stats_.total_matches; } if (stats_.total_hashes > 0) { - LOG_INFO << "segmentation collisions: L1=" - << fmt::format("{:.3f}%", - 100.0 * (l1_collisions + stats_.l2_collisions) / - stats_.total_hashes) - << ", L2=" - << fmt::format("{:.3f}%", - 100.0 * stats_.l2_collisions / stats_.total_hashes) - << " [" << stats_.total_hashes << " hashes]"; + LOG_VERBOSE << cfg_.context << "segmentation collisions: L1=" + << fmt::format("{:.3f}%", + 100.0 * (l1_collisions + stats_.l2_collisions) / + stats_.total_hashes) + << ", L2=" + << fmt::format("{:.3f}%", 100.0 * stats_.l2_collisions / + stats_.total_hashes) + << " [" << stats_.total_hashes << " hashes]"; } if (l1_collisions > 0) { auto pct = [&](double p) { return stats_.l2_collision_vec_size.getPercentileEstimate(p); }; - LOG_DEBUG << "collision vector size p50: " << pct(0.5) + LOG_DEBUG << cfg_.context << "collision vector size p50: " << pct(0.5) << ", p75: " << pct(0.75) << ", p90: " << pct(0.9) << ", p95: " << pct(0.95) << ", p99: " << pct(0.99); } auto pct = [&](double p) { return match_counts_.getPercentileEstimate(p); }; - LOG_DEBUG << "match counts p50: " << pct(0.5) << ", p75: " << pct(0.75) - << ", p90: " << pct(0.9) << ", p95: " << pct(0.95) - << ", p99: " << pct(0.99); + LOG_DEBUG << cfg_.context << "match counts p50: " << pct(0.5) + << ", p75: " << pct(0.75) << ", p90: " << pct(0.9) + << ", p95: " << pct(0.95) << ", p99: " << pct(0.99); for (auto [k, v] : repeating_collisions_) { - LOG_INFO << fmt::format("avoided {} collisions in 0x{:02x}-byte sequences", - v, k); + LOG_VERBOSE << cfg_.context + << fmt::format( + "avoided {} collisions in 0x{:02x}-byte sequences", v, + k); } } @@ -952,8 +956,9 @@ void segmenter_::append_to_block( auto const size_in_bytes = frames_to_bytes(size_in_frames); auto& block = blocks_.back(); - LOG_TRACE << "appending " << size_in_bytes << " bytes to block " - << block.num() << " @ " << frames_to_bytes(block.size_in_frames()) + LOG_TRACE << cfg_.context << "appending " << size_in_bytes + << " bytes to block " << block.num() << " @ " + << frames_to_bytes(block.size_in_frames()) << " from chunkable offset " << offset_in_bytes; block.append_bytes(chkable.span().subspan(offset_in_bytes, size_in_bytes), @@ -1064,7 +1069,7 @@ void segmenter_::segment_and_add_data( ++stats_.bloom_true_positives; match_counts_.addValue(matches.size()); - LOG_TRACE << "[" << blocks_.back().num() << " @ " + LOG_TRACE << cfg_.context << "[" << blocks_.back().num() << " @ " << frames_to_bytes(blocks_.back().size_in_frames()) << ", chunkable @ " << frames_to_bytes(offset_in_frames) << "] found " << matches.size() @@ -1072,13 +1077,15 @@ void segmenter_::segment_and_add_data( << ", window size=" << window_size_ << ")"; for (auto& m : matches) { - LOG_TRACE << " block " << m.block_num() << " @ " << m.offset(); + LOG_TRACE << cfg_.context << " block " << m.block_num() << " @ " + << m.offset(); // m.verify_and_extend(p + offset_in_frames - window_size_, // window_size_, // p + frames_written, p + size_in_frames); m.verify_and_extend(data, offset_in_frames - window_size_, window_size_, frames_written, size_in_frames); - LOG_TRACE << " -> " << m.offset() << " -> " << m.size(); + LOG_TRACE << cfg_.context << " -> " << m.offset() << " -> " + << m.size(); } stats_.total_matches += matches.size(); @@ -1091,8 +1098,8 @@ void segmenter_::segment_and_add_data( if (match_len > 0) { ++stats_.good_matches; - LOG_TRACE << "successful match of length " << match_len << " @ " - << best->offset(); + LOG_TRACE << cfg_.context << "successful match of length " + << match_len << " @ " << best->offset(); auto block_num = best->block_num(); auto match_off = best->offset(); diff --git a/src/dwarfs/segmenter_factory.cpp b/src/dwarfs/segmenter_factory.cpp index 37c844f8..ba36c9c4 100644 --- a/src/dwarfs/segmenter_factory.cpp +++ b/src/dwarfs/segmenter_factory.cpp @@ -20,15 +20,18 @@ */ #include "dwarfs/segmenter_factory.h" +#include "dwarfs/categorizer.h" namespace dwarfs { class segmenter_factory_ final : public segmenter_factory::impl { public: segmenter_factory_(logger& lgr, progress& prog, + std::shared_ptr catmgr, const segmenter_factory::config& cfg) : lgr_{lgr} , prog_{prog} + , catmgr_{catmgr} , cfg_{cfg} {} segmenter create(fragment_category cat, compression_constraints const& cc, @@ -36,6 +39,10 @@ class segmenter_factory_ final : public segmenter_factory::impl { segmenter::block_ready_cb block_ready) const override { segmenter::config cfg; + if (catmgr_) { + cfg.context = category_prefix(catmgr_, cat); + } + cfg.blockhash_window_size = cfg_.blockhash_window_size.get(cat); cfg.window_increment_shift = cfg_.window_increment_shift.get(cat); cfg.max_active_blocks = cfg_.max_active_blocks.get(cat); @@ -53,11 +60,18 @@ class segmenter_factory_ final : public segmenter_factory::impl { private: logger& lgr_; progress& prog_; + std::shared_ptr catmgr_; segmenter_factory::config cfg_; }; +segmenter_factory::segmenter_factory( + logger& lgr, progress& prog, std::shared_ptr catmgr, + config const& cfg) + : impl_(std::make_unique(lgr, prog, std::move(catmgr), + cfg)) {} + segmenter_factory::segmenter_factory(logger& lgr, progress& prog, config const& cfg) - : impl_(std::make_unique(lgr, prog, cfg)) {} + : segmenter_factory(lgr, prog, nullptr, cfg) {} } // namespace dwarfs diff --git a/src/mkdwarfs_main.cpp b/src/mkdwarfs_main.cpp index fd29a111..3b0ba216 100644 --- a/src/mkdwarfs_main.cpp +++ b/src/mkdwarfs_main.cpp @@ -1053,9 +1053,10 @@ int mkdwarfs_main(int argc, sys_char** argv) { fsw, rw_opts); wg_compress.wait(); } else { - auto sf = std::make_shared(lgr, prog, sf_config); + auto sf = std::make_shared( + lgr, prog, options.inode.categorizer_mgr, sf_config); - scanner s(lgr, wg_scanner, sf, entry_factory::create(), + scanner s(lgr, wg_scanner, std::move(sf), entry_factory::create(), std::make_shared(), std::move(script), options);