diff --git a/include/dwarfs/writer/segmenter.h b/include/dwarfs/writer/segmenter.h index 92848b9b..2b7635bc 100644 --- a/include/dwarfs/writer/segmenter.h +++ b/include/dwarfs/writer/segmenter.h @@ -61,6 +61,9 @@ class segmenter { using block_ready_cb = std::function; + static size_t + estimate_memory_usage(config const& cfg, compression_constraints const& cc); + segmenter(logger& lgr, writer_progress& prog, std::shared_ptr blkmgr, config const& cfg, compression_constraints const& cc, size_t total_size, diff --git a/include/dwarfs/writer/segmenter_factory.h b/include/dwarfs/writer/segmenter_factory.h index e040036f..fd305089 100644 --- a/include/dwarfs/writer/segmenter_factory.h +++ b/include/dwarfs/writer/segmenter_factory.h @@ -67,6 +67,11 @@ class segmenter_factory { size_t get_block_size() const { return impl_->get_block_size(); } + size_t estimate_memory_usage(fragment_category cat, + compression_constraints const& cc) const { + return impl_->estimate_memory_usage(cat, cc); + } + class impl { public: virtual ~impl() = default; @@ -76,6 +81,9 @@ class segmenter_factory { std::shared_ptr blkmgr, segmenter::block_ready_cb block_ready) const = 0; virtual size_t get_block_size() const = 0; + virtual size_t + estimate_memory_usage(fragment_category cat, + compression_constraints const& cc) const = 0; }; private: diff --git a/src/writer/scanner.cpp b/src/writer/scanner.cpp index b863f891..ce12dfca 100644 --- a/src/writer/scanner.cpp +++ b/src/writer/scanner.cpp @@ -814,6 +814,11 @@ void scanner_::scan( auto cc = fsw.get_compression_constraints(category.value(), meta); + LOG_DEBUG << category_prefix(catmgr, category) + << "segmenter will use up to " + << size_with_unit( + segmenter_factory_.estimate_memory_usage(category, cc)); + wg_blockify.add_job([this, catmgr, blockmgr, category, cat_size, meta, cc, &prog, &fsw, &im, &wg_ordering] { auto span = im.ordered_span(category, wg_ordering); diff --git a/src/writer/segmenter.cpp b/src/writer/segmenter.cpp index 
075f278c..b7a616f0 100644
--- a/src/writer/segmenter.cpp
+++ b/src/writer/segmenter.cpp
@@ -711,6 +711,18 @@ class segmenter_progress : public progress::context {
   size_t const bytes_total_;
 };
 
+// Returns 2^blockhash_window_size, or 0 if blockhash_window_size is 0.
+DWARFS_FORCE_INLINE size_t window_size(segmenter::config const& cfg) {
+  return cfg.blockhash_window_size > 0
+             ? static_cast<size_t>(1) << cfg.blockhash_window_size
+             : 0;
+}
+
+// Returns window_size(cfg) >> window_increment_shift, but never less than 1.
+DWARFS_FORCE_INLINE size_t window_step(segmenter::config const& cfg) {
+  return std::max<size_t>(1, window_size(cfg) >> cfg.window_increment_shift);
+}
+
 template <typename LoggerPolicy, typename SegmentingPolicy>
 class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
  private:
@@ -794,16 +806,6 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
     return 0;
   }
 
-  static DWARFS_FORCE_INLINE size_t window_size(segmenter::config const& cfg) {
-    return cfg.blockhash_window_size > 0
-               ? static_cast<size_t>(1) << cfg.blockhash_window_size
-               : 0;
-  }
-
-  static DWARFS_FORCE_INLINE size_t window_step(segmenter::config const& cfg) {
-    return std::max<size_t>(1, window_size(cfg) >> cfg.window_increment_shift);
-  }
-
   size_t DWARFS_FORCE_INLINE
   block_size_in_frames(segmenter::config const& cfg) const {
     auto raw_size = static_cast<size_t>(1) << cfg.block_size_bits;
@@ -1419,4 +1421,53 @@ segmenter::segmenter(logger& lgr, writer_progress& prog,
                              std::move(blkmgr), cfg, cc, total_size,
                              std::move(block_ready))) {}
 
+// Estimates the memory used by a segmenter's lookup structures.
+//
+// The estimate covers the per-block offset tables, at a worst-case cost of
+// kWorstCaseBytesPerOffset bytes per offset, and the bloom filters. Memory
+// for the block data buffers is deliberately not included.
+//
+// Returns 0 if cfg.max_active_blocks or cfg.blockhash_window_size is zero.
+// A missing cc.granularity is treated as 1.
+size_t segmenter::estimate_memory_usage(config const& cfg,
+                                        compression_constraints const& cc) {
+  if (cfg.max_active_blocks == 0 or cfg.blockhash_window_size == 0) {
+    return 0;
+  }
+
+  static constexpr size_t kWorstCaseBytesPerOffset = 19; // 8 bytes / 0.4375
+
+  size_t const granularity = cc.granularity.value_or(1);
+  size_t const block_size_in_frames =
+      (static_cast<size_t>(1) << cfg.block_size_bits) / granularity;
+
+  size_t const win_size = internal::window_size(cfg);
+  size_t const win_step = internal::window_step(cfg);
+
+  // Consecutive windows overlap by `win_size - win_step` frames. If a block
+  // is not larger than that overlap, no window fits; guard the subtraction
+  // so it cannot wrap around and produce an absurdly large estimate.
+  size_t const win_overlap = win_size - win_step;
+  size_t const max_offset_count =
+      block_size_in_frames > win_overlap
+          ? (block_size_in_frames - win_overlap) / win_step
+          : 0;
+
+  size_t const bloom_filter_mem =
+      ((static_cast<size_t>(1) << cfg.bloom_filter_size) *
+       std::bit_ceil(cfg.max_active_blocks *
+                     (block_size_in_frames / win_step))) /
+      8;
+
+  // Single active block uses memory for:
+  // - offsets
+  // - bloom filter (only with MultiBlockSegmentationPolicy)
+  // We do *not* consider the memory for the block data buffer here
+  size_t const active_block_mem_usage =
+      (max_offset_count * kWorstCaseBytesPerOffset) +
+      (cfg.max_active_blocks > 1 ? bloom_filter_mem : 0);
+
+  return cfg.max_active_blocks * active_block_mem_usage + bloom_filter_mem;
+}
+
 } // namespace dwarfs::writer
diff --git a/src/writer/segmenter_factory.cpp b/src/writer/segmenter_factory.cpp
index eb80d80e..b002ca2c 100644
--- a/src/writer/segmenter_factory.cpp
+++ b/src/writer/segmenter_factory.cpp
@@ -42,6 +42,23 @@ class segmenter_factory_ final : public segmenter_factory::impl {
                    compression_constraints const& cc,
                    std::shared_ptr<block_manager> blkmgr,
                    segmenter::block_ready_cb block_ready) const override {
+    return {lgr_, prog_, std::move(blkmgr), make_segmenter_config(cat),
+            cc, cat_size, std::move(block_ready)};
+  }
+
+  size_t get_block_size() const override {
+    return static_cast<size_t>(1) << cfg_.block_size_bits;
+  }
+
+  size_t
+  estimate_memory_usage(fragment_category cat,
+                        compression_constraints const& cc) const override {
+    return segmenter::estimate_memory_usage(make_segmenter_config(cat), cc);
+  }
+
+ private:
+  // Builds the segmenter configuration for the given fragment category.
+  segmenter::config make_segmenter_config(fragment_category cat) const {
     segmenter::config cfg;
 
     if (catmgr_) {
@@ -54,15 +71,9 @@ class segmenter_factory_ final : public segmenter_factory::impl {
     cfg.bloom_filter_size = cfg_.bloom_filter_size.get(cat);
     cfg.block_size_bits = cfg_.block_size_bits;
 
-    return {lgr_, prog_, std::move(blkmgr), cfg,
-            cc, cat_size, std::move(block_ready)};
+    return cfg;
   }
 
-  size_t get_block_size() const override {
-    return static_cast<size_t>(1) << cfg_.block_size_bits;
-  }
-
- private:
   logger& lgr_;
   writer_progress& prog_;
   std::shared_ptr<categorizer_manager> catmgr_;