mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
Block manager cleanup & statistics
This commit is contained in:
parent
425c9d68e9
commit
0da750c143
@ -50,18 +50,12 @@ class block_manager {
|
|||||||
|
|
||||||
void finish_blocks() { impl_->finish_blocks(); }
|
void finish_blocks() { impl_->finish_blocks(); }
|
||||||
|
|
||||||
size_t total_size() const { return impl_->total_size(); }
|
|
||||||
|
|
||||||
size_t total_blocks() const { return impl_->total_blocks(); }
|
|
||||||
|
|
||||||
class impl {
|
class impl {
|
||||||
public:
|
public:
|
||||||
virtual ~impl() = default;
|
virtual ~impl() = default;
|
||||||
|
|
||||||
virtual void add_inode(std::shared_ptr<inode> ino) = 0;
|
virtual void add_inode(std::shared_ptr<inode> ino) = 0;
|
||||||
virtual void finish_blocks() = 0;
|
virtual void finish_blocks() = 0;
|
||||||
virtual size_t total_size() const = 0;
|
|
||||||
virtual size_t total_blocks() const = 0;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -33,6 +33,8 @@
|
|||||||
|
|
||||||
#include <sparsehash/dense_hash_map>
|
#include <sparsehash/dense_hash_map>
|
||||||
|
|
||||||
|
#include <folly/stats/Histogram.h>
|
||||||
|
|
||||||
#include "dwarfs/block_data.h"
|
#include "dwarfs/block_data.h"
|
||||||
#include "dwarfs/block_manager.h"
|
#include "dwarfs/block_manager.h"
|
||||||
#include "dwarfs/compiler.h"
|
#include "dwarfs/compiler.h"
|
||||||
@ -72,6 +74,18 @@ namespace dwarfs {
|
|||||||
* configurable.
|
* configurable.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
struct bm_stats {
|
||||||
|
bm_stats()
|
||||||
|
: l2_collision_vec_size(1, 0, 128) {}
|
||||||
|
|
||||||
|
size_t total_hashes{0};
|
||||||
|
size_t l2_collisions{0};
|
||||||
|
size_t total_matches{0};
|
||||||
|
size_t good_matches{0};
|
||||||
|
size_t bad_matches{0};
|
||||||
|
folly::Histogram<size_t> l2_collision_vec_size;
|
||||||
|
};
|
||||||
|
|
||||||
template <typename KeyT, typename ValT, KeyT EmptyKey = KeyT{},
|
template <typename KeyT, typename ValT, KeyT EmptyKey = KeyT{},
|
||||||
size_t MaxCollInline = 2>
|
size_t MaxCollInline = 2>
|
||||||
class fast_multimap {
|
class fast_multimap {
|
||||||
@ -106,6 +120,9 @@ class fast_multimap {
|
|||||||
collisions_.clear();
|
collisions_.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read-only access to the `values_` table.
blockhash_t const& values() const { return values_; }
|
||||||
|
// Read-only access to the `collisions_` table.
collision_t const& collisions() const { return collisions_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
blockhash_t values_;
|
blockhash_t values_;
|
||||||
collision_t collisions_;
|
collision_t collisions_;
|
||||||
@ -145,10 +162,21 @@ class active_block {
|
|||||||
offsets_.for_each_value(key, std::forward<F>(func));
|
offsets_.for_each_value(key, std::forward<F>(func));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Folds the hash table statistics of this block into `stats`.
//
// Every entry of the primary table counts as one hash. For each collision
// vector, all of its elements count as hashes, all elements but one count
// as L2 collisions, and the vector's size is added to the histogram.
void finalize(bm_stats& stats) {
  stats.total_hashes += offsets_.values().size();

  for (auto const& c : offsets_.collisions()) {
    auto const num = c.second.size();

    stats.total_hashes += num;

    // The subtraction is unsigned; never let an empty vector wrap around.
    if (num > 0) {
      stats.l2_collisions += num - 1;
    }

    stats.l2_collision_vec_size.addValue(num);
  }
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
static constexpr size_t num_inline_offsets = 4;
|
||||||
|
|
||||||
size_t num_, capacity_, window_size_, window_step_mask_;
|
size_t num_, capacity_, window_size_, window_step_mask_;
|
||||||
rsync_hash hasher_;
|
rsync_hash hasher_;
|
||||||
fast_multimap<hash_t, offset_t> offsets_;
|
fast_multimap<hash_t, offset_t, num_inline_offsets> offsets_;
|
||||||
std::shared_ptr<block_data> data_;
|
std::shared_ptr<block_data> data_;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -171,9 +199,6 @@ class block_manager_ : public block_manager::impl {
|
|||||||
void add_inode(std::shared_ptr<inode> ino) override;
|
void add_inode(std::shared_ptr<inode> ino) override;
|
||||||
void finish_blocks() override;
|
void finish_blocks() override;
|
||||||
|
|
||||||
size_t total_size() const override { return 0; } // TODO
|
|
||||||
size_t total_blocks() const override { return 0; } // TODO
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct chunk_state {
|
struct chunk_state {
|
||||||
size_t offset{0};
|
size_t offset{0};
|
||||||
@ -199,6 +224,8 @@ class block_manager_ : public block_manager::impl {
|
|||||||
|
|
||||||
chunk_state chunk_;
|
chunk_state chunk_;
|
||||||
|
|
||||||
|
bm_stats stats_;
|
||||||
|
|
||||||
// Active blocks are blocks that can still be referenced from new chunks.
|
// Active blocks are blocks that can still be referenced from new chunks.
|
||||||
// Up to N blocks (configurable) can be active and are kept in this queue.
|
// Up to N blocks (configurable) can be active and are kept in this queue.
|
||||||
// All active blocks except for the last one are immutable and potentially
|
// All active blocks except for the last one are immutable and potentially
|
||||||
@ -293,7 +320,7 @@ void block_manager_<LoggerPolicy>::add_inode(std::shared_ptr<inode> ino) {
|
|||||||
<< "] - size: " << size;
|
<< "] - size: " << size;
|
||||||
|
|
||||||
if (window_size_ == 0 or size < window_size_) {
|
if (window_size_ == 0 or size < window_size_) {
|
||||||
// no point dealing with hashes, just write it out
|
// no point dealing with hashing, just write it out
|
||||||
add_data(*ino, *mm, 0, size);
|
add_data(*ino, *mm, 0, size);
|
||||||
finish_chunk(*ino);
|
finish_chunk(*ino);
|
||||||
} else {
|
} else {
|
||||||
@ -307,11 +334,36 @@ void block_manager_<LoggerPolicy>::finish_blocks() {
|
|||||||
if (!blocks_.empty()) {
|
if (!blocks_.empty()) {
|
||||||
block_ready();
|
block_ready();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount();
|
||||||
|
|
||||||
|
LOG_INFO << "segmentation matches: good=" << stats_.good_matches
|
||||||
|
<< ", bad=" << stats_.bad_matches
|
||||||
|
<< ", total=" << stats_.total_matches;
|
||||||
|
LOG_INFO << "segmentation collisions: L1="
|
||||||
|
<< fmt::format("{:.3f}%",
|
||||||
|
100.0 * (l1_collisions + stats_.l2_collisions) /
|
||||||
|
stats_.total_hashes)
|
||||||
|
<< ", L2="
|
||||||
|
<< fmt::format("{:.3f}%",
|
||||||
|
100.0 * stats_.l2_collisions / stats_.total_hashes)
|
||||||
|
<< " [" << stats_.total_hashes << " hashes]";
|
||||||
|
|
||||||
|
if (l1_collisions > 0) {
|
||||||
|
auto pct = [&](double p) {
|
||||||
|
return stats_.l2_collision_vec_size.getPercentileEstimate(p);
|
||||||
|
};
|
||||||
|
LOG_DEBUG << "collision vector size p50: " << pct(0.5)
|
||||||
|
<< ", p75: " << pct(0.75) << ", p90: " << pct(0.9)
|
||||||
|
<< ", p95: " << pct(0.95) << ", p99: " << pct(0.99);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename LoggerPolicy>
|
template <typename LoggerPolicy>
|
||||||
void block_manager_<LoggerPolicy>::block_ready() {
|
void block_manager_<LoggerPolicy>::block_ready() {
|
||||||
fsw_.write_block(blocks_.back().data());
|
auto& block = blocks_.back();
|
||||||
|
block.finalize(stats_);
|
||||||
|
fsw_.write_block(block.data());
|
||||||
++prog_.block_count;
|
++prog_.block_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -399,22 +451,26 @@ void block_manager_<LoggerPolicy>::segment_and_add_data(inode& ino, mmif& mm,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!matches.empty()) {
|
if (!matches.empty()) {
|
||||||
// TODO: verify & extend matches, find longest match
|
|
||||||
|
|
||||||
LOG_TRACE << "found " << matches.size() << " matches (hash=" << hasher()
|
LOG_TRACE << "found " << matches.size() << " matches (hash=" << hasher()
|
||||||
<< ", window size=" << window_size_ << ")";
|
<< ", window size=" << window_size_ << ")";
|
||||||
|
|
||||||
for (auto& m : matches) {
|
for (auto& m : matches) {
|
||||||
LOG_TRACE << " @" << m.offset();
|
LOG_TRACE << " block " << m.block_num() << " @ " << m.offset();
|
||||||
m.verify_and_extend(p + offset - window_size_, window_size_,
|
m.verify_and_extend(p + offset - window_size_, window_size_,
|
||||||
p + written, p + size);
|
p + written, p + size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stats_.total_matches += matches.size();
|
||||||
|
stats_.bad_matches +=
|
||||||
|
std::count_if(matches.begin(), matches.end(),
|
||||||
|
[](auto const& m) { return m.size() == 0; });
|
||||||
|
|
||||||
auto best = std::max_element(matches.begin(), matches.end());
|
auto best = std::max_element(matches.begin(), matches.end());
|
||||||
auto match_len = best->size();
|
auto match_len = best->size();
|
||||||
|
|
||||||
if (match_len > 0) {
|
if (match_len > 0) {
|
||||||
LOG_DEBUG << "successful match of length " << match_len << " @ "
|
++stats_.good_matches;
|
||||||
|
LOG_TRACE << "successful match of length " << match_len << " @ "
|
||||||
<< best->offset();
|
<< best->offset();
|
||||||
|
|
||||||
auto block_num = best->block_num();
|
auto block_num = best->block_num();
|
||||||
|
@ -570,19 +570,6 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
|
|||||||
});
|
});
|
||||||
prog.sync([&] { prog.current.store(nullptr); });
|
prog.sync([&] { prog.current.store(nullptr); });
|
||||||
|
|
||||||
// TODO: check this, doesn't seem to come out right in debug output
|
|
||||||
// seems to be out-of-line with block compression??
|
|
||||||
LOG_DEBUG << "compressed " << size_with_unit(bm.total_size()) << " in "
|
|
||||||
<< bm.total_blocks() << " blocks to "
|
|
||||||
<< size_with_unit(prog.compressed_size) << " (ratio="
|
|
||||||
<< (bm.total_size() ? static_cast<double>(prog.compressed_size) /
|
|
||||||
bm.total_size()
|
|
||||||
: 1.0)
|
|
||||||
<< ")";
|
|
||||||
|
|
||||||
LOG_DEBUG << "saved by segmenting: "
|
|
||||||
<< size_with_unit(prog.saved_by_segmentation);
|
|
||||||
|
|
||||||
// this is actually needed
|
// this is actually needed
|
||||||
root->set_name(std::string());
|
root->set_name(std::string());
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user