diff --git a/include/dwarfs/block_manager.h b/include/dwarfs/block_manager.h index ba9c9e78..2e29340b 100644 --- a/include/dwarfs/block_manager.h +++ b/include/dwarfs/block_manager.h @@ -50,18 +50,12 @@ class block_manager { void finish_blocks() { impl_->finish_blocks(); } - size_t total_size() const { return impl_->total_size(); } - - size_t total_blocks() const { return impl_->total_blocks(); } - class impl { public: virtual ~impl() = default; virtual void add_inode(std::shared_ptr ino) = 0; virtual void finish_blocks() = 0; - virtual size_t total_size() const = 0; - virtual size_t total_blocks() const = 0; }; private: diff --git a/src/dwarfs/block_manager_v2.cpp b/src/dwarfs/block_manager_v2.cpp index 58345b3d..c43a1400 100644 --- a/src/dwarfs/block_manager_v2.cpp +++ b/src/dwarfs/block_manager_v2.cpp @@ -33,6 +33,8 @@ #include +#include + #include "dwarfs/block_data.h" #include "dwarfs/block_manager.h" #include "dwarfs/compiler.h" @@ -72,6 +74,18 @@ namespace dwarfs { * configurable. 
*/ +struct bm_stats { + bm_stats() + : l2_collision_vec_size(1, 0, 128) {} + + size_t total_hashes{0}; + size_t l2_collisions{0}; + size_t total_matches{0}; + size_t good_matches{0}; + size_t bad_matches{0}; + folly::Histogram l2_collision_vec_size; +}; + template class fast_multimap { @@ -106,6 +120,9 @@ class fast_multimap { collisions_.clear(); } + blockhash_t const& values() const { return values_; }; + collision_t const& collisions() const { return collisions_; }; + private: blockhash_t values_; collision_t collisions_; @@ -145,10 +162,21 @@ class active_block { offsets_.for_each_value(key, std::forward(func)); } + void finalize(bm_stats& stats) { + stats.total_hashes += offsets_.values().size(); + for (auto& c : offsets_.collisions()) { + stats.total_hashes += c.second.size(); + stats.l2_collisions += c.second.size() - 1; + stats.l2_collision_vec_size.addValue(c.second.size()); + } + } + private: + static constexpr size_t num_inline_offsets = 4; + size_t num_, capacity_, window_size_, window_step_mask_; rsync_hash hasher_; - fast_multimap offsets_; + fast_multimap offsets_; std::shared_ptr data_; }; @@ -171,9 +199,6 @@ class block_manager_ : public block_manager::impl { void add_inode(std::shared_ptr ino) override; void finish_blocks() override; - size_t total_size() const override { return 0; } // TODO - size_t total_blocks() const override { return 0; } // TODO - private: struct chunk_state { size_t offset{0}; @@ -199,6 +224,8 @@ class block_manager_ : public block_manager::impl { chunk_state chunk_; + bm_stats stats_; + // Active blocks are blocks that can still be referenced from new chunks. // Up to N blocks (configurable) can be active and are kept in this queue. 
// All active blocks except for the last one are immutable and potentially @@ -293,7 +320,7 @@ void block_manager_::add_inode(std::shared_ptr ino) { << "] - size: " << size; if (window_size_ == 0 or size < window_size_) { - // no point dealing with hashes, just write it out + // no point dealing with hashing, just write it out add_data(*ino, *mm, 0, size); finish_chunk(*ino); } else { @@ -307,11 +334,40 @@ void block_manager_::finish_blocks() { if (!blocks_.empty()) { block_ready(); } + + auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount(); + + // total_hashes stays 0 if no block recorded any hash; report 0% in that + // case instead of computing 0/0, which would yield NaN in the log output. + auto hash_pct = [&](double count) { + return stats_.total_hashes > 0 ? 100.0 * count / stats_.total_hashes + : 0.0; + }; + + LOG_INFO << "segmentation matches: good=" << stats_.good_matches + << ", bad=" << stats_.bad_matches + << ", total=" << stats_.total_matches; + LOG_INFO << "segmentation collisions: L1=" + << fmt::format("{:.3f}%", + hash_pct(l1_collisions + stats_.l2_collisions)) + << ", L2=" << fmt::format("{:.3f}%", hash_pct(stats_.l2_collisions)) + << " [" << stats_.total_hashes << " hashes]"; + + if (l1_collisions > 0) { + auto pct = [&](double p) { + return stats_.l2_collision_vec_size.getPercentileEstimate(p); + }; + LOG_DEBUG << "collision vector size p50: " << pct(0.5) + << ", p75: " << pct(0.75) << ", p90: " << pct(0.9) + << ", p95: " << pct(0.95) << ", p99: " << pct(0.99); + } } template void block_manager_::block_ready() { - fsw_.write_block(blocks_.back().data()); + auto& block = blocks_.back(); + block.finalize(stats_); + fsw_.write_block(block.data()); ++prog_.block_count; } @@ -399,22 +455,26 @@ void block_manager_::segment_and_add_data(inode& ino, mmif& mm, } if (!matches.empty()) { - // TODO: verify & extend matches, find longest match - LOG_TRACE << "found " << matches.size() << " matches (hash=" << hasher() << ", window size=" << window_size_ << ")"; for (auto& m : matches) { - LOG_TRACE << " @" << m.offset(); + LOG_TRACE << " block " << m.block_num() << " @ " << m.offset(); m.verify_and_extend(p + offset - window_size_, window_size_, p + written, p + 
size); } + stats_.total_matches += matches.size(); + stats_.bad_matches += + std::count_if(matches.begin(), matches.end(), + [](auto const& m) { return m.size() == 0; }); + auto best = std::max_element(matches.begin(), matches.end()); auto match_len = best->size(); if (match_len > 0) { - LOG_DEBUG << "successful match of length " << match_len << " @ " + ++stats_.good_matches; + LOG_TRACE << "successful match of length " << match_len << " @ " << best->offset(); auto block_num = best->block_num(); diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 5aea9e18..2ddeb5ae 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -570,19 +570,6 @@ void scanner_::scan(filesystem_writer& fsw, }); prog.sync([&] { prog.current.store(nullptr); }); - // TODO: check this, doesn't seem to come out right in debug output - // seems to be out-of-line with block compression?? - LOG_DEBUG << "compressed " << size_with_unit(bm.total_size()) << " in " - << bm.total_blocks() << " blocks to " - << size_with_unit(prog.compressed_size) << " (ratio=" - << (bm.total_size() ? static_cast(prog.compressed_size) / - bm.total_size() - : 1.0) - << ")"; - - LOG_DEBUG << "saved by segmenting: " - << size_with_unit(prog.saved_by_segmentation); - // this is actually needed root->set_name(std::string());