chore: add segmenter memory usage estimation

This commit is contained in:
Marcus Holland-Moritz 2025-05-25 16:24:18 +02:00
parent 2e78d049a1
commit 44bb5d7357
5 changed files with 76 additions and 17 deletions

View File

@ -61,6 +61,9 @@ class segmenter {
using block_ready_cb = using block_ready_cb =
std::function<void(shared_byte_buffer, size_t logical_block_num)>; std::function<void(shared_byte_buffer, size_t logical_block_num)>;
// Estimate how much memory (in bytes) a segmenter configured with `cfg` and
// constrained by `cc` may use. The definition accounts for per-block offset
// tables and bloom filters, but not for the block data buffers, and yields
// zero when `cfg.max_active_blocks` or `cfg.blockhash_window_size` is zero.
static size_t
estimate_memory_usage(config const& cfg, compression_constraints const& cc);
segmenter(logger& lgr, writer_progress& prog, segmenter(logger& lgr, writer_progress& prog,
std::shared_ptr<internal::block_manager> blkmgr, config const& cfg, std::shared_ptr<internal::block_manager> blkmgr, config const& cfg,
compression_constraints const& cc, size_t total_size, compression_constraints const& cc, size_t total_size,

View File

@ -67,6 +67,11 @@ class segmenter_factory {
size_t get_block_size() const { return impl_->get_block_size(); } size_t get_block_size() const { return impl_->get_block_size(); }
// Ask the factory implementation for the estimated memory usage of a
// segmenter created for category `cat` under the constraints `cc`.
size_t estimate_memory_usage(fragment_category cat,
                             compression_constraints const& cc) const {
  size_t const estimate = impl_->estimate_memory_usage(cat, cc);
  return estimate;
}
class impl { class impl {
public: public:
virtual ~impl() = default; virtual ~impl() = default;
@ -76,6 +81,9 @@ class segmenter_factory {
std::shared_ptr<internal::block_manager> blkmgr, std::shared_ptr<internal::block_manager> blkmgr,
segmenter::block_ready_cb block_ready) const = 0; segmenter::block_ready_cb block_ready) const = 0;
virtual size_t get_block_size() const = 0; virtual size_t get_block_size() const = 0;
// Implementations return the estimated memory usage of a segmenter that
// would be created for category `cat` under the constraints `cc`.
virtual size_t
estimate_memory_usage(fragment_category cat,
compression_constraints const& cc) const = 0;
}; };
private: private:

View File

@ -814,6 +814,11 @@ void scanner_<LoggerPolicy>::scan(
auto cc = fsw.get_compression_constraints(category.value(), meta); auto cc = fsw.get_compression_constraints(category.value(), meta);
LOG_DEBUG << category_prefix(catmgr, category)
<< "segmenter will use up to "
<< size_with_unit(
segmenter_factory_.estimate_memory_usage(category, cc));
wg_blockify.add_job([this, catmgr, blockmgr, category, cat_size, meta, cc, wg_blockify.add_job([this, catmgr, blockmgr, category, cat_size, meta, cc,
&prog, &fsw, &im, &wg_ordering] { &prog, &fsw, &im, &wg_ordering] {
auto span = im.ordered_span(category, wg_ordering); auto span = im.ordered_span(category, wg_ordering);

View File

@ -711,6 +711,16 @@ class segmenter_progress : public progress::context {
size_t const bytes_total_; size_t const bytes_total_;
}; };
// Size of the block hash window: 2^blockhash_window_size, or zero when
// `cfg.blockhash_window_size` is not positive.
DWARFS_FORCE_INLINE size_t window_size(segmenter::config const& cfg) {
  if (cfg.blockhash_window_size > 0) {
    return static_cast<size_t>(1) << cfg.blockhash_window_size;
  }
  return 0;
}
// Step between consecutive hash windows: the window size shifted right by
// `cfg.window_increment_shift`, but never less than one.
DWARFS_FORCE_INLINE size_t window_step(segmenter::config const& cfg) {
  size_t const shifted = window_size(cfg) >> cfg.window_increment_shift;
  return shifted > 0 ? shifted : static_cast<size_t>(1);
}
template <typename LoggerPolicy, typename SegmentingPolicy> template <typename LoggerPolicy, typename SegmentingPolicy>
class segmenter_ final : public segmenter::impl, private SegmentingPolicy { class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
private: private:
@ -794,16 +804,6 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
return 0; return 0;
} }
// Size of the block hash window: 2^blockhash_window_size, or zero when
// `cfg.blockhash_window_size` is not positive.
static DWARFS_FORCE_INLINE size_t window_size(segmenter::config const& cfg) {
return cfg.blockhash_window_size > 0
? static_cast<size_t>(1) << cfg.blockhash_window_size
: 0;
}
// Step between consecutive hash windows: the window size shifted right by
// `cfg.window_increment_shift`, clamped to a minimum of one.
static DWARFS_FORCE_INLINE size_t window_step(segmenter::config const& cfg) {
return std::max<size_t>(1, window_size(cfg) >> cfg.window_increment_shift);
}
size_t DWARFS_FORCE_INLINE size_t DWARFS_FORCE_INLINE
block_size_in_frames(segmenter::config const& cfg) const { block_size_in_frames(segmenter::config const& cfg) const {
auto raw_size = static_cast<size_t>(1) << cfg.block_size_bits; auto raw_size = static_cast<size_t>(1) << cfg.block_size_bits;
@ -1419,4 +1419,37 @@ segmenter::segmenter(logger& lgr, writer_progress& prog,
std::move(blkmgr), cfg, cc, total_size, std::move(blkmgr), cfg, cc, total_size,
std::move(block_ready))) {} std::move(block_ready))) {}
// Estimate the worst-case memory usage (in bytes) of a segmenter.
//
// `cfg` supplies the segmenter parameters (active block count, window size
// and step, bloom filter size, block size) and `cc` the optional granularity
// used to convert the block size into frames.
//
// Returns zero when `cfg.max_active_blocks` or `cfg.blockhash_window_size`
// is zero. The block data buffers themselves are not part of the estimate.
size_t segmenter::estimate_memory_usage(config const& cfg,
                                        compression_constraints const& cc) {
  if (cfg.max_active_blocks == 0 or cfg.blockhash_window_size == 0) {
    return 0;
  }

  static constexpr size_t kWorstCaseBytesPerOffset = 19; // 8 bytes / 0.4375

  size_t const granularity = cc.granularity.value_or(1);
  size_t const block_size_in_frames =
      (static_cast<size_t>(1) << cfg.block_size_bits) / granularity;
  size_t const win_size = internal::window_size(cfg);
  size_t const win_step = internal::window_step(cfg);

  // The window step is derived from the window size by a right shift, so it
  // does not exceed the (non-zero) window size and this cannot wrap.
  size_t const win_overlap = win_size - win_step;

  // A block that is not larger than the window overlap cannot hold a single
  // offset. Clamp to zero instead of letting the unsigned subtraction wrap
  // around and produce an absurdly large estimate.
  size_t const max_offset_count =
      block_size_in_frames > win_overlap
          ? (block_size_in_frames - win_overlap) / win_step
          : 0;

  // Bloom filter size in bits, divided by 8 to obtain bytes.
  size_t const bloom_filter_mem =
      ((static_cast<size_t>(1) << cfg.bloom_filter_size) *
       std::bit_ceil(cfg.max_active_blocks *
                     (block_size_in_frames / win_step))) /
      8;

  // Single active block uses memory for:
  // - offsets
  // - bloom filter (only with MultiBlockSegmentationPolicy)
  // We do *not* consider the memory for the block data buffer here
  size_t const active_block_mem_usage =
      (max_offset_count * kWorstCaseBytesPerOffset) +
      (cfg.max_active_blocks > 1 ? bloom_filter_mem : 0);

  // All active blocks plus one more bloom filter on top.
  return cfg.max_active_blocks * active_block_mem_usage + bloom_filter_mem;
}
} // namespace dwarfs::writer } // namespace dwarfs::writer

View File

@ -42,6 +42,22 @@ class segmenter_factory_ final : public segmenter_factory::impl {
compression_constraints const& cc, compression_constraints const& cc,
std::shared_ptr<block_manager> blkmgr, std::shared_ptr<block_manager> blkmgr,
segmenter::block_ready_cb block_ready) const override { segmenter::block_ready_cb block_ready) const override {
return {lgr_, prog_, std::move(blkmgr), make_segmenter_config(cat),
cc, cat_size, std::move(block_ready)};
}
// Block size derived from the factory configuration: 2^block_size_bits.
size_t get_block_size() const override {
  size_t const one{1};
  return one << cfg_.block_size_bits;
}
// Build the segmenter configuration for category `cat` and hand it, together
// with the constraints `cc`, to the segmenter's static estimator.
size_t
estimate_memory_usage(fragment_category cat,
                      compression_constraints const& cc) const override {
  auto const seg_cfg = make_segmenter_config(cat);
  return segmenter::estimate_memory_usage(seg_cfg, cc);
}
private:
segmenter::config make_segmenter_config(fragment_category cat) const {
segmenter::config cfg; segmenter::config cfg;
if (catmgr_) { if (catmgr_) {
@ -54,15 +70,9 @@ class segmenter_factory_ final : public segmenter_factory::impl {
cfg.bloom_filter_size = cfg_.bloom_filter_size.get(cat); cfg.bloom_filter_size = cfg_.bloom_filter_size.get(cat);
cfg.block_size_bits = cfg_.block_size_bits; cfg.block_size_bits = cfg_.block_size_bits;
return {lgr_, prog_, std::move(blkmgr), cfg, return cfg;
cc, cat_size, std::move(block_ready)};
} }
// Block size derived from the factory configuration: 2^block_size_bits.
size_t get_block_size() const override {
return static_cast<size_t>(1) << cfg_.block_size_bits;
}
private:
logger& lgr_; logger& lgr_;
writer_progress& prog_; writer_progress& prog_;
std::shared_ptr<categorizer_manager> catmgr_; std::shared_ptr<categorizer_manager> catmgr_;