mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-15 15:26:19 -04:00
Add bloom filter to speed up block manager
This commit is contained in:
parent
35ea333f85
commit
3a78ab9ce4
@ -41,6 +41,7 @@ class block_manager {
|
|||||||
size_t max_active_blocks{1};
|
size_t max_active_blocks{1};
|
||||||
size_t memory_limit{256 << 20};
|
size_t memory_limit{256 << 20};
|
||||||
unsigned block_size_bits{22};
|
unsigned block_size_bits{22};
|
||||||
|
unsigned bloom_filter_size{4};
|
||||||
};
|
};
|
||||||
|
|
||||||
block_manager(logger& lgr, progress& prog, const config& cfg,
|
block_manager(logger& lgr, progress& prog, const config& cfg,
|
||||||
|
@ -32,6 +32,7 @@
|
|||||||
|
|
||||||
#include <parallel_hashmap/phmap.h>
|
#include <parallel_hashmap/phmap.h>
|
||||||
|
|
||||||
|
#include <folly/hash/Hash.h>
|
||||||
#include <folly/small_vector.h>
|
#include <folly/small_vector.h>
|
||||||
#include <folly/stats/Histogram.h>
|
#include <folly/stats/Histogram.h>
|
||||||
|
|
||||||
@ -83,6 +84,9 @@ struct bm_stats {
|
|||||||
size_t total_matches{0};
|
size_t total_matches{0};
|
||||||
size_t good_matches{0};
|
size_t good_matches{0};
|
||||||
size_t bad_matches{0};
|
size_t bad_matches{0};
|
||||||
|
size_t bloom_lookups{0};
|
||||||
|
size_t bloom_hits{0};
|
||||||
|
size_t bloom_true_positives{0};
|
||||||
folly::Histogram<size_t> l2_collision_vec_size;
|
folly::Histogram<size_t> l2_collision_vec_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -126,17 +130,91 @@ class fast_multimap {
|
|||||||
collision_t collisions_;
|
collision_t collisions_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
constexpr unsigned bitcount(unsigned n) {
|
||||||
|
return n > 0 ? (n & 1) + bitcount(n >> 1) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr uint64_t pow2ceil(uint64_t n) {
|
||||||
|
n--;
|
||||||
|
n |= n >> 1;
|
||||||
|
n |= n >> 2;
|
||||||
|
n |= n >> 4;
|
||||||
|
n |= n >> 8;
|
||||||
|
n |= n >> 16;
|
||||||
|
n |= n >> 32;
|
||||||
|
n++;
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A very simple bloom filter. This is not generalized at all and highly
|
||||||
|
* optimized for the cyclic hash use case.
|
||||||
|
*
|
||||||
|
* - Since we're already using a hash value, there's no need to hash the
|
||||||
|
* value before accessing the bloom filter bit field.
|
||||||
|
*
|
||||||
|
* - We can accept a high false positive rate as the secondary lookup
|
||||||
|
* is not very expensive. However, the bloom filter lookup must be
|
||||||
|
* extremely cheap, so we can't afford e.g. using two hashes instead
|
||||||
|
* of one.
|
||||||
|
*/
|
||||||
|
class bloom_filter {
|
||||||
|
public:
|
||||||
|
using bits_type = uint64_t;
|
||||||
|
|
||||||
|
static constexpr size_t value_mask = 8 * sizeof(bits_type) - 1;
|
||||||
|
static constexpr size_t index_shift = bitcount(value_mask);
|
||||||
|
|
||||||
|
bloom_filter(size_t size)
|
||||||
|
: index_mask_{(size >> index_shift) - 1} {
|
||||||
|
if (size & (size - 1)) {
|
||||||
|
throw std::runtime_error("size must be a power of two");
|
||||||
|
}
|
||||||
|
bits_.resize(size >> index_shift);
|
||||||
|
}
|
||||||
|
|
||||||
|
void add(size_t val) { set(val); }
|
||||||
|
|
||||||
|
bool test(size_t val) const { return isset(val); }
|
||||||
|
|
||||||
|
size_t size() const { return bits_.size() << index_shift; }
|
||||||
|
|
||||||
|
void clear() { std::fill(bits_.begin(), bits_.end(), 0); }
|
||||||
|
|
||||||
|
void merge(bloom_filter const& other) {
|
||||||
|
if (bits_.size() != other.bits_.size()) {
|
||||||
|
throw std::runtime_error("size mismatch");
|
||||||
|
}
|
||||||
|
std::transform(bits_.cbegin(), bits_.cend(), other.bits_.cbegin(),
|
||||||
|
bits_.begin(), std::bit_or<>{});
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void set(size_t ix) {
|
||||||
|
bits_[(ix >> index_shift) & index_mask_] |= 1 << (ix & value_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isset(size_t ix) const {
|
||||||
|
return bits_[(ix >> index_shift) & index_mask_] & (1 << (ix & value_mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<bits_type> bits_;
|
||||||
|
size_t const index_mask_;
|
||||||
|
};
|
||||||
|
|
||||||
class active_block {
|
class active_block {
|
||||||
private:
|
private:
|
||||||
using offset_t = uint32_t;
|
using offset_t = uint32_t;
|
||||||
using hash_t = uint32_t;
|
using hash_t = uint32_t;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
active_block(size_t num, size_t size, size_t window_size, size_t window_step)
|
active_block(size_t num, size_t size, size_t window_size, size_t window_step,
|
||||||
|
size_t bloom_filter_size)
|
||||||
: num_(num)
|
: num_(num)
|
||||||
, capacity_(size)
|
, capacity_(size)
|
||||||
, window_size_(window_size)
|
, window_size_(window_size)
|
||||||
, window_step_mask_(window_step - 1)
|
, window_step_mask_(window_step - 1)
|
||||||
|
, filter_(bloom_filter_size)
|
||||||
, data_{std::make_shared<block_data>()} {
|
, data_{std::make_shared<block_data>()} {
|
||||||
DWARFS_CHECK((window_step & window_step_mask_) == 0,
|
DWARFS_CHECK((window_step & window_step_mask_) == 0,
|
||||||
"window step size not a power of two");
|
"window step size not a power of two");
|
||||||
@ -149,7 +227,7 @@ class active_block {
|
|||||||
bool full() const { return size() == capacity_; }
|
bool full() const { return size() == capacity_; }
|
||||||
std::shared_ptr<block_data> data() const { return data_; }
|
std::shared_ptr<block_data> data() const { return data_; }
|
||||||
|
|
||||||
void append(uint8_t const* p, size_t size);
|
void append(uint8_t const* p, size_t size, bloom_filter* filter);
|
||||||
|
|
||||||
size_t next_hash_distance() const {
|
size_t next_hash_distance() const {
|
||||||
return window_step_mask_ + 1 - (data_->vec().size() & window_step_mask_);
|
return window_step_mask_ + 1 - (data_->vec().size() & window_step_mask_);
|
||||||
@ -160,6 +238,13 @@ class active_block {
|
|||||||
offsets_.for_each_value(key, std::forward<F>(func));
|
offsets_.for_each_value(key, std::forward<F>(func));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename F>
|
||||||
|
void for_each_offset_filter(hash_t key, F&& func) const {
|
||||||
|
if (DWARFS_UNLIKELY(filter_.test(key))) {
|
||||||
|
offsets_.for_each_value(key, std::forward<F>(func));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void finalize(bm_stats& stats) {
|
void finalize(bm_stats& stats) {
|
||||||
stats.total_hashes += offsets_.values().size();
|
stats.total_hashes += offsets_.values().size();
|
||||||
for (auto& c : offsets_.collisions()) {
|
for (auto& c : offsets_.collisions()) {
|
||||||
@ -169,11 +254,14 @@ class active_block {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bloom_filter const& filter() const { return filter_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static constexpr size_t num_inline_offsets = 4;
|
static constexpr size_t num_inline_offsets = 4;
|
||||||
|
|
||||||
size_t num_, capacity_, window_size_, window_step_mask_;
|
size_t num_, capacity_, window_size_, window_step_mask_;
|
||||||
rsync_hash hasher_;
|
rsync_hash hasher_;
|
||||||
|
bloom_filter filter_;
|
||||||
fast_multimap<hash_t, offset_t, num_inline_offsets> offsets_;
|
fast_multimap<hash_t, offset_t, num_inline_offsets> offsets_;
|
||||||
std::shared_ptr<block_data> data_;
|
std::shared_ptr<block_data> data_;
|
||||||
};
|
};
|
||||||
@ -192,7 +280,10 @@ class block_manager_ final : public block_manager::impl {
|
|||||||
? static_cast<size_t>(1) << cfg.blockhash_window_size
|
? static_cast<size_t>(1) << cfg.blockhash_window_size
|
||||||
: 0}
|
: 0}
|
||||||
, window_step_{window_size_ >> cfg.window_increment_shift}
|
, window_step_{window_size_ >> cfg.window_increment_shift}
|
||||||
, block_size_{static_cast<size_t>(1) << cfg.block_size_bits} {}
|
, block_size_{static_cast<size_t>(1) << cfg.block_size_bits}
|
||||||
|
, filter_{bloom_filter_size()} {
|
||||||
|
LOG_INFO << "bloom filter size: " << size_with_unit(filter_.size() / 8);
|
||||||
|
}
|
||||||
|
|
||||||
void add_inode(std::shared_ptr<inode> ino) override;
|
void add_inode(std::shared_ptr<inode> ino) override;
|
||||||
void finish_blocks() override;
|
void finish_blocks() override;
|
||||||
@ -208,6 +299,11 @@ class block_manager_ final : public block_manager::impl {
|
|||||||
void append_to_block(inode& ino, mmif& mm, size_t offset, size_t size);
|
void append_to_block(inode& ino, mmif& mm, size_t offset, size_t size);
|
||||||
void add_data(inode& ino, mmif& mm, size_t offset, size_t size);
|
void add_data(inode& ino, mmif& mm, size_t offset, size_t size);
|
||||||
void segment_and_add_data(inode& ino, mmif& mm, size_t size);
|
void segment_and_add_data(inode& ino, mmif& mm, size_t size);
|
||||||
|
size_t bloom_filter_size() const {
|
||||||
|
auto hash_count = pow2ceil(std::max<size_t>(1, cfg_.max_active_blocks)) *
|
||||||
|
(block_size_ / window_step_);
|
||||||
|
return (1 << cfg_.bloom_filter_size) * hash_count;
|
||||||
|
}
|
||||||
|
|
||||||
log_proxy<LoggerPolicy> log_;
|
log_proxy<LoggerPolicy> log_;
|
||||||
progress& prog_;
|
progress& prog_;
|
||||||
@ -222,6 +318,8 @@ class block_manager_ final : public block_manager::impl {
|
|||||||
|
|
||||||
chunk_state chunk_;
|
chunk_state chunk_;
|
||||||
|
|
||||||
|
bloom_filter filter_;
|
||||||
|
|
||||||
bm_stats stats_;
|
bm_stats stats_;
|
||||||
|
|
||||||
// Active blocks are blocks that can still be referenced from new chunks.
|
// Active blocks are blocks that can still be referenced from new chunks.
|
||||||
@ -259,7 +357,7 @@ class segment_match {
|
|||||||
uint8_t const* data_;
|
uint8_t const* data_;
|
||||||
};
|
};
|
||||||
|
|
||||||
void active_block::append(uint8_t const* p, size_t size) {
|
void active_block::append(uint8_t const* p, size_t size, bloom_filter* filter) {
|
||||||
auto& v = data_->vec();
|
auto& v = data_->vec();
|
||||||
auto offset = v.size();
|
auto offset = v.size();
|
||||||
DWARFS_CHECK(offset + size <= capacity_,
|
DWARFS_CHECK(offset + size <= capacity_,
|
||||||
@ -275,7 +373,12 @@ void active_block::append(uint8_t const* p, size_t size) {
|
|||||||
} else {
|
} else {
|
||||||
hasher_.update(v[offset - window_size_], v[offset]);
|
hasher_.update(v[offset - window_size_], v[offset]);
|
||||||
if (DWARFS_UNLIKELY((++offset & window_step_mask_) == 0)) {
|
if (DWARFS_UNLIKELY((++offset & window_step_mask_) == 0)) {
|
||||||
offsets_.insert(hasher_(), offset - window_size_);
|
auto hashval = hasher_();
|
||||||
|
offsets_.insert(hashval, offset - window_size_);
|
||||||
|
filter_.add(hashval);
|
||||||
|
if (filter) {
|
||||||
|
filter->add(hashval);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -338,6 +441,13 @@ void block_manager_<LoggerPolicy>::finish_blocks() {
|
|||||||
|
|
||||||
auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount();
|
auto l1_collisions = stats_.l2_collision_vec_size.computeTotalCount();
|
||||||
|
|
||||||
|
LOG_INFO << "bloom filter reject rate: "
|
||||||
|
<< fmt::format("{:.3f}%", 100.0 - 100.0 * stats_.bloom_hits /
|
||||||
|
stats_.bloom_lookups)
|
||||||
|
<< " (TPR="
|
||||||
|
<< fmt::format("{:.3f}%", 100.0 * stats_.bloom_true_positives /
|
||||||
|
stats_.bloom_hits)
|
||||||
|
<< ", lookups=" << stats_.bloom_lookups << ")";
|
||||||
LOG_INFO << "segmentation matches: good=" << stats_.good_matches
|
LOG_INFO << "segmentation matches: good=" << stats_.good_matches
|
||||||
<< ", bad=" << stats_.bad_matches
|
<< ", bad=" << stats_.bad_matches
|
||||||
<< ", total=" << stats_.total_matches;
|
<< ", total=" << stats_.total_matches;
|
||||||
@ -376,14 +486,19 @@ void block_manager_<LoggerPolicy>::append_to_block(inode& ino, mmif& mm,
|
|||||||
blocks_.pop_front();
|
blocks_.pop_front();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
filter_.clear();
|
||||||
|
for (auto const& b : blocks_) {
|
||||||
|
filter_.merge(b.filter());
|
||||||
|
}
|
||||||
|
|
||||||
blocks_.emplace_back(block_count_++, block_size_,
|
blocks_.emplace_back(block_count_++, block_size_,
|
||||||
cfg_.max_active_blocks > 0 ? window_size_ : 0,
|
cfg_.max_active_blocks > 0 ? window_size_ : 0,
|
||||||
window_step_);
|
window_step_, filter_.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
auto& block = blocks_.back();
|
auto& block = blocks_.back();
|
||||||
|
|
||||||
block.append(mm.as<uint8_t>(offset), size);
|
block.append(mm.as<uint8_t>(offset), size, &filter_);
|
||||||
chunk_.size += size;
|
chunk_.size += size;
|
||||||
|
|
||||||
prog_.filesystem_size += size;
|
prog_.filesystem_size += size;
|
||||||
@ -444,15 +559,28 @@ void block_manager_<LoggerPolicy>::segment_and_add_data(inode& ino, mmif& mm,
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<segment_match> matches;
|
std::vector<segment_match> matches;
|
||||||
|
const bool single_block_mode = cfg_.max_active_blocks == 1;
|
||||||
|
|
||||||
while (offset < size) {
|
while (offset < size) {
|
||||||
for (auto const& block : blocks_) {
|
++stats_.bloom_lookups;
|
||||||
|
if (DWARFS_UNLIKELY(filter_.test(hasher()))) {
|
||||||
|
++stats_.bloom_hits;
|
||||||
|
if (single_block_mode) {
|
||||||
|
auto& block = blocks_.front();
|
||||||
block.for_each_offset(hasher(), [&](uint32_t offset) {
|
block.for_each_offset(hasher(), [&](uint32_t offset) {
|
||||||
matches.emplace_back(&block, offset);
|
matches.emplace_back(&block, offset);
|
||||||
});
|
});
|
||||||
|
} else {
|
||||||
|
for (auto const& block : blocks_) {
|
||||||
|
block.for_each_offset_filter(hasher(), [&](uint32_t offset) {
|
||||||
|
matches.emplace_back(&block, offset);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (DWARFS_UNLIKELY(!matches.empty())) {
|
if (DWARFS_UNLIKELY(!matches.empty())) {
|
||||||
|
++stats_.bloom_true_positives;
|
||||||
|
|
||||||
LOG_TRACE << "found " << matches.size() << " matches (hash=" << hasher()
|
LOG_TRACE << "found " << matches.size() << " matches (hash=" << hasher()
|
||||||
<< ", window size=" << window_size_ << ")";
|
<< ", window size=" << window_size_ << ")";
|
||||||
|
|
||||||
@ -511,6 +639,7 @@ void block_manager_<LoggerPolicy>::segment_and_add_data(inode& ino, mmif& mm,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// no matches found, see if we can append data
|
// no matches found, see if we can append data
|
||||||
// we need to keep at least lookback_size bytes unwritten
|
// we need to keep at least lookback_size bytes unwritten
|
||||||
|
@ -380,6 +380,9 @@ int mkdwarfs(int argc, char** argv) {
|
|||||||
po::value<unsigned>(&cfg.window_increment_shift)
|
po::value<unsigned>(&cfg.window_increment_shift)
|
||||||
->default_value(1),
|
->default_value(1),
|
||||||
"window step (as right shift of size)")
|
"window step (as right shift of size)")
|
||||||
|
("bloom-filter-size",
|
||||||
|
po::value<unsigned>(&cfg.bloom_filter_size)->default_value(5),
|
||||||
|
"bloom filter size (2^N*values bits)")
|
||||||
("memory-limit,L",
|
("memory-limit,L",
|
||||||
po::value<std::string>(&memory_limit)->default_value("1g"),
|
po::value<std::string>(&memory_limit)->default_value("1g"),
|
||||||
"block manager memory limit")
|
"block manager memory limit")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user