mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
Block manager cleanup & statistics
This commit is contained in:
parent
425c9d68e9
commit
0da750c143
@ -50,18 +50,12 @@ class block_manager {
|
||||
|
||||
// Forwards to the implementation object's finish_blocks().
void finish_blocks() { impl_->finish_blocks(); }
|
||||
|
||||
// Forwards to the implementation object's total_size().
// NOTE(review): the only override visible in this view returns 0 and is
// marked TODO, so the value is not meaningful yet — confirm before relying on it.
size_t total_size() const { return impl_->total_size(); }
|
||||
|
||||
// Forwards to the implementation object's total_blocks().
// NOTE(review): the only override visible in this view returns 0 and is
// marked TODO, so the value is not meaningful yet — confirm before relying on it.
size_t total_blocks() const { return impl_->total_blocks(); }
|
||||
|
||||
// Abstract implementation interface of block_manager. The public
// block_manager member functions visible in this view forward to it
// through impl_.
class impl {
|
||||
public:
|
||||
// Virtual so that a concrete implementation can be destroyed through a
// pointer to impl.
virtual ~impl() = default;
|
||||
|
||||
// Receives the inode as a shared_ptr taken by value.
virtual void add_inode(std::shared_ptr<inode> ino) = 0;
|
||||
// In the block_manager_ implementation shown in this view, this hands the
// last pending block (if any) to block_ready() and logs statistics.
virtual void finish_blocks() = 0;
|
||||
// NOTE(review): the only override visible in this view returns 0 (TODO).
virtual size_t total_size() const = 0;
|
||||
// NOTE(review): the only override visible in this view returns 0 (TODO).
virtual size_t total_blocks() const = 0;
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -33,6 +33,8 @@
|
||||
|
||||
#include <sparsehash/dense_hash_map>
|
||||
|
||||
#include <folly/stats/Histogram.h>
|
||||
|
||||
#include "dwarfs/block_data.h"
|
||||
#include "dwarfs/block_manager.h"
|
||||
#include "dwarfs/compiler.h"
|
||||
@ -72,6 +74,18 @@ namespace dwarfs {
|
||||
* configurable.
|
||||
*/
|
||||
|
||||
// Aggregate counters describing hash-table occupancy and segment matching.
// Filled in by active_block::finalize() and by the match handling in
// block_manager_::segment_and_add_data(); reported by finish_blocks().
struct bm_stats {
|
||||
// All counters start at zero; the histogram is constructed with
// (bucketSize=1, min=0, max=128) in folly::Histogram's argument order.
bm_stats()
|
||||
: l2_collision_vec_size(1, 0, 128) {}
|
||||
|
||||
// Entries in the primary table plus all entries held in collision vectors.
size_t total_hashes{0};
|
||||
// Sum over all collision vectors of (vector size - 1).
size_t l2_collisions{0};
|
||||
// Number of candidate matches examined (matches.size() per lookup hit).
size_t total_matches{0};
|
||||
// Incremented once per lookup whose best match has a non-zero length.
size_t good_matches{0};
|
||||
// Candidates whose size() is still 0 after verify_and_extend().
size_t bad_matches{0};
|
||||
// One sample per collision entry: the size of its collision vector.
// finish_blocks() uses the histogram's total count as the L1 collision count.
folly::Histogram<size_t> l2_collision_vec_size;
|
||||
};
|
||||
|
||||
template <typename KeyT, typename ValT, KeyT EmptyKey = KeyT{},
|
||||
size_t MaxCollInline = 2>
|
||||
class fast_multimap {
|
||||
@ -106,6 +120,9 @@ class fast_multimap {
|
||||
collisions_.clear();
|
||||
}
|
||||
|
||||
// Read-only access to values_ (used by active_block::finalize() to count
// stored hashes).
blockhash_t const& values() const { return values_; }
|
||||
// Read-only access to collisions_ (used by active_block::finalize() to
// inspect the per-key collision vectors).
collision_t const& collisions() const { return collisions_; }
|
||||
|
||||
private:
|
||||
blockhash_t values_;
|
||||
collision_t collisions_;
|
||||
@ -145,10 +162,21 @@ class active_block {
|
||||
offsets_.for_each_value(key, std::forward<F>(func));
|
||||
}
|
||||
|
||||
// Adds this block's hash-table occupancy to the aggregate statistics.
//
// stats: accumulator that is updated in place; nothing is returned.
void finalize(bm_stats& stats) {
|
||||
// Every entry of the primary table counts as one stored hash.
stats.total_hashes += offsets_.values().size();
|
||||
for (auto& c : offsets_.collisions()) {
|
||||
// Entries kept in the collision vector are stored hashes as well.
stats.total_hashes += c.second.size();
|
||||
// NOTE(review): assumes each collision vector holds at least one element;
// with size() == 0 this unsigned subtraction would wrap — confirm.
stats.l2_collisions += c.second.size() - 1;
|
||||
// Record the vector length so percentiles can be reported later.
stats.l2_collision_vec_size.addValue(c.second.size());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr size_t num_inline_offsets = 4;
|
||||
|
||||
size_t num_, capacity_, window_size_, window_step_mask_;
|
||||
rsync_hash hasher_;
|
||||
fast_multimap<hash_t, offset_t> offsets_;
|
||||
fast_multimap<hash_t, offset_t, num_inline_offsets> offsets_;
|
||||
std::shared_ptr<block_data> data_;
|
||||
};
|
||||
|
||||
@ -171,9 +199,6 @@ class block_manager_ : public block_manager::impl {
|
||||
void add_inode(std::shared_ptr<inode> ino) override;
|
||||
void finish_blocks() override;
|
||||
|
||||
size_t total_size() const override { return 0; } // TODO
|
||||
size_t total_blocks() const override { return 0; } // TODO
|
||||
|
||||
private:
|
||||
struct chunk_state {
|
||||
size_t offset{0};
|
||||
@ -199,6 +224,8 @@ class block_manager_ : public block_manager::impl {
|
||||
|
||||
chunk_state chunk_;
|
||||
|
||||
bm_stats stats_;
|
||||
|
||||
// Active blocks are blocks that can still be referenced from new chunks.
|
||||
// Up to N blocks (configurable) can be active and are kept in this queue.
|
||||
// All active blocks except for the last one are immutable and potentially
|
||||
@ -293,7 +320,7 @@ void block_manager_<LoggerPolicy>::add_inode(std::shared_ptr<inode> ino) {
|
||||
<< "] - size: " << size;
|
||||
|
||||
if (window_size_ == 0 or size < window_size_) {
|
||||
// no point dealing with hashes, just write it out
|
||||
// no point dealing with hashing, just write it out
|
||||
add_data(*ino, *mm, 0, size);
|
||||
finish_chunk(*ino);
|
||||
} else {
|
||||
@ -307,11 +334,36 @@ void block_manager_<LoggerPolicy>::finish_blocks() {
|
||||
if (!blocks_.empty()) {
|
||||
block_ready();
|
||||
}
|
||||
|
||||
auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount();
|
||||
|
||||
LOG_INFO << "segmentation matches: good=" << stats_.good_matches
|
||||
<< ", bad=" << stats_.bad_matches
|
||||
<< ", total=" << stats_.total_matches;
|
||||
LOG_INFO << "segmentation collisions: L1="
|
||||
<< fmt::format("{:.3f}%",
|
||||
100.0 * (l1_collisions + stats_.l2_collisions) /
|
||||
stats_.total_hashes)
|
||||
<< ", L2="
|
||||
<< fmt::format("{:.3f}%",
|
||||
100.0 * stats_.l2_collisions / stats_.total_hashes)
|
||||
<< " [" << stats_.total_hashes << " hashes]";
|
||||
|
||||
if (l1_collisions > 0) {
|
||||
auto pct = [&](double p) {
|
||||
return stats_.l2_collision_vec_size.getPercentileEstimate(p);
|
||||
};
|
||||
LOG_DEBUG << "collision vector size p50: " << pct(0.5)
|
||||
<< ", p75: " << pct(0.75) << ", p90: " << pct(0.9)
|
||||
<< ", p95: " << pct(0.95) << ", p99: " << pct(0.99);
|
||||
}
|
||||
}
|
||||
|
||||
// Called when the most recent block (blocks_.back()) is complete: collects
// its hash statistics into stats_, passes its data to the filesystem writer
// and increments the progress block counter.
//
// NOTE(review): as rendered here the block's data is passed to
// fsw_.write_block() twice (once via blocks_.back().data() and once via
// block.data()). This view looks like a diff with removed and added lines
// interleaved — confirm that only one call exists in the real file.
template <typename LoggerPolicy>
|
||||
void block_manager_<LoggerPolicy>::block_ready() {
|
||||
fsw_.write_block(blocks_.back().data());
|
||||
auto& block = blocks_.back();
|
||||
// Fold this block's hash-table occupancy into stats_ before writing it out.
block.finalize(stats_);
|
||||
fsw_.write_block(block.data());
|
||||
++prog_.block_count;
|
||||
}
|
||||
|
||||
@ -399,22 +451,26 @@ void block_manager_<LoggerPolicy>::segment_and_add_data(inode& ino, mmif& mm,
|
||||
}
|
||||
|
||||
if (!matches.empty()) {
|
||||
// TODO: verify & extend matches, find longest match
|
||||
|
||||
LOG_TRACE << "found " << matches.size() << " matches (hash=" << hasher()
|
||||
<< ", window size=" << window_size_ << ")";
|
||||
|
||||
for (auto& m : matches) {
|
||||
LOG_TRACE << " @" << m.offset();
|
||||
LOG_TRACE << " block " << m.block_num() << " @ " << m.offset();
|
||||
m.verify_and_extend(p + offset - window_size_, window_size_,
|
||||
p + written, p + size);
|
||||
}
|
||||
|
||||
stats_.total_matches += matches.size();
|
||||
stats_.bad_matches +=
|
||||
std::count_if(matches.begin(), matches.end(),
|
||||
[](auto const& m) { return m.size() == 0; });
|
||||
|
||||
auto best = std::max_element(matches.begin(), matches.end());
|
||||
auto match_len = best->size();
|
||||
|
||||
if (match_len > 0) {
|
||||
LOG_DEBUG << "successful match of length " << match_len << " @ "
|
||||
++stats_.good_matches;
|
||||
LOG_TRACE << "successful match of length " << match_len << " @ "
|
||||
<< best->offset();
|
||||
|
||||
auto block_num = best->block_num();
|
||||
|
@ -570,19 +570,6 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
||||
});
|
||||
prog.sync([&] { prog.current.store(nullptr); });
|
||||
|
||||
// TODO: check this, doesn't seem to come out right in debug output
|
||||
// seems to be out-of-line with block compression??
|
||||
LOG_DEBUG << "compressed " << size_with_unit(bm.total_size()) << " in "
|
||||
<< bm.total_blocks() << " blocks to "
|
||||
<< size_with_unit(prog.compressed_size) << " (ratio="
|
||||
<< (bm.total_size() ? static_cast<double>(prog.compressed_size) /
|
||||
bm.total_size()
|
||||
: 1.0)
|
||||
<< ")";
|
||||
|
||||
LOG_DEBUG << "saved by segmenting: "
|
||||
<< size_with_unit(prog.saved_by_segmentation);
|
||||
|
||||
// this is actually needed
|
||||
root->set_name(std::string());
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user