feat(metadata): add block category information

This commit is contained in:
Marcus Holland-Moritz 2023-11-28 10:30:41 +01:00
parent 6aba533688
commit 95eac672f8
7 changed files with 97 additions and 23 deletions

View File

@ -25,6 +25,7 @@
#include <optional>
#include <vector>
#include "dwarfs/fragment_category.h"
#include "dwarfs/gen-cpp2/metadata_types.h"
namespace dwarfs {
@ -34,13 +35,17 @@ class block_manager {
// Chunk type taken from the thrift metadata definitions.
using chunk_type = thrift::metadata::chunk;
// NOTE(review): the definition is not visible in this view; presumably this
// hands out the next logical block number — confirm against the implementation.
size_t get_logical_block() const;
// Two-argument form: associates a logical block with the block number it
// was written as.
void set_written_block(size_t logical_block, size_t written_block);
// Three-argument form: associates `logical_block` with the block number it
// was written as and with a fragment category value.
void set_written_block(size_t logical_block, size_t written_block,
fragment_category::value_type category);
// Replaces the block number stored in each chunk of `vec` with the written
// block number recorded for that block.
void map_logical_blocks(std::vector<chunk_type>& vec);
// Returns the recorded categories as a vector indexed by written block
// number.
std::vector<fragment_category::value_type>
get_written_block_categories() const;
private:
// Locked by set_written_block() and get_written_block_categories().
std::mutex mutable mx_;
// Exclusive upper bound for valid logical block numbers (asserted by the
// member functions); also the size block_map_ is grown to.
size_t mutable num_blocks_{0};
// Indexed by logical block number; an entry holds the written block number
// once it has been set.
std::vector<std::optional<size_t>> block_map_;
// Indexed by logical block number; an entry holds
// {written block number, category} once it has been set.
std::vector<std::optional<std::pair<size_t, fragment_category::value_type>>>
block_map_;
};
} // namespace dwarfs

View File

@ -48,9 +48,8 @@ class segmenter {
unsigned block_size_bits{22};
};
// Block-ready callback taking the block's data plus a second callback
// through which the physical block number is reported back as a size_t.
using block_ready_cb =
folly::Function<void(std::shared_ptr<block_data>,
folly::Function<void(size_t)> physical_block_cb)>;
// Block-ready callback taking the block's data plus the block's logical
// block number; with this form the callee is the one that has to record
// the logical-to-physical mapping.
using block_ready_cb = folly::Function<void(std::shared_ptr<block_data>,
size_t logical_block_num)>;
segmenter(logger& lgr, progress& prog, std::shared_ptr<block_manager> blkmgr,
config const& cfg, compression_constraints const& cc,

View File

@ -35,13 +35,14 @@ size_t block_manager::get_logical_block() const {
}
// Records, for `logical_block`, the block number it was written as
// (`written_block`) and, in the three-argument form, its category.
//
// `logical_block` must be smaller than num_blocks_; this is only checked
// via assert. The whole update runs with mx_ held.
void block_manager::set_written_block(size_t logical_block,
size_t written_block) {
size_t written_block,
fragment_category::value_type category) {
std::lock_guard lock{mx_};
assert(logical_block < num_blocks_);
// Grow the map lazily so that every logical block number below
// num_blocks_ has a slot; new slots start out empty.
if (block_map_.size() < num_blocks_) {
block_map_.resize(num_blocks_);
}
// Store only the written block number.
block_map_[logical_block] = written_block;
// Store the written block number together with the category.
block_map_[logical_block] = std::make_pair(written_block, category);
}
void block_manager::map_logical_blocks(std::vector<chunk_type>& vec) {
@ -49,8 +50,26 @@ void block_manager::map_logical_blocks(std::vector<chunk_type>& vec) {
for (auto& c : vec) {
size_t block = c.get_block();
assert(block < num_blocks_);
c.block() = block_map_[block].value();
c.block() = block_map_[block].value().first;
}
}
std::vector<fragment_category::value_type>
block_manager::get_written_block_categories() const {
std::vector<fragment_category::value_type> result;
{
std::lock_guard lock{mx_};
result.resize(num_blocks_);
for (auto& b : block_map_) {
auto& mapping = b.value();
result[mapping.first] = mapping.second;
}
}
return result;
}
} // namespace dwarfs

View File

@ -700,9 +700,17 @@ void scanner_<LoggerPolicy>::scan(
auto seg = segmenter_factory_->create(
category, cat_size, cc, blockmgr,
[category, meta, &fsw](auto block, auto physical_block_cb) {
fsw.write_block(category, std::move(block),
std::move(physical_block_cb), meta);
[category, meta, blockmgr, &fsw](auto block,
auto logical_block_num) {
fsw.write_block(
category, std::move(block),
[blockmgr, logical_block_num,
category](auto physical_block_num) {
blockmgr->set_written_block(logical_block_num,
physical_block_num,
category.value());
},
meta);
});
for (auto ino : span) {
@ -862,6 +870,33 @@ void scanner_<LoggerPolicy>::scan(
mv2.preferred_path_separator() =
static_cast<uint32_t>(std::filesystem::path::preferred_separator);
// If a categorizer manager is configured, store category information in
// the metadata: the list of category names and, for every written block,
// an index into that list.
if (auto catmgr = options_.inode.categorizer_mgr) {
// Maps a category value to its index in `category_names`.
std::unordered_map<fragment_category::value_type,
fragment_category::value_type>
category_indices;
std::vector<std::string> category_names;
category_indices.reserve(frag_info.info.size());
category_names.reserve(frag_info.info.size());
// Indices are assigned in order of first appearance in frag_info.info;
// a name is appended only when its category was not seen before, so
// `category_names.size()` is always the next free index.
for (auto const& ci : frag_info.info) {
auto [it, inserted] =
category_indices.emplace(ci.category, category_names.size());
if (inserted) {
category_names.emplace_back(catmgr->category_name(ci.category));
}
}
// Translate each written block's category value into its index in
// `category_names`, in place. `at()` throws std::out_of_range if a
// block carries a category that is not present in frag_info.info.
auto written_categories = blockmgr->get_written_block_categories();
std::transform(written_categories.begin(), written_categories.end(),
written_categories.begin(),
[&](auto const& cat) { return category_indices.at(cat); });
mv2.category_names() = std::move(category_names);
mv2.block_categories() = std::move(written_categories);
}
auto [schema, data] = metadata_v2::freeze(mv2);
LOG_VERBOSE << "uncompressed metadata size: " << size_with_unit(data.size());

View File

@ -969,11 +969,7 @@ template <typename LoggerPolicy, typename SegmentingPolicy>
// Finalizes the most recently added block, passes it to the block_ready_
// callback and increments the progress block counter.
void segmenter_<LoggerPolicy, SegmentingPolicy>::block_ready() {
auto& block = blocks_.back();
block.finalize(stats_);
// Callback form that hands out a closure; the closure records the
// logical-to-physical mapping in the block manager once it is invoked
// with the physical block number.
block_ready_(block.data(), [blkmgr = blkmgr_,
logical_block_num =
block.num()](size_t physical_block_num) {
blkmgr->set_written_block(logical_block_num, physical_block_num);
});
// Callback form that only passes the logical block number; recording the
// mapping is left to the callback.
block_ready_(block.data(), block.num());
++prog_.block_count;
}

View File

@ -144,13 +144,14 @@ void run_segmenter_test(unsigned iters, unsigned granularity,
std::vector<std::shared_ptr<dwarfs::block_data>> written;
dwarfs::segmenter seg(lgr, prog, blkmgr, cfg, cc, total_size,
[&written](std::shared_ptr<dwarfs::block_data> blk,
auto physical_block_cb) {
size_t num = written.size();
written.push_back(blk);
physical_block_cb(num);
});
dwarfs::segmenter seg(
lgr, prog, blkmgr, cfg, cc, total_size,
[&written, blkmgr](std::shared_ptr<dwarfs::block_data> blk,
auto logical_block_num) {
auto physical_block_num = written.size();
written.push_back(blk);
blkmgr->set_written_block(logical_block_num, physical_block_num, 0);
});
suspender.dismiss();

View File

@ -361,8 +361,27 @@ struct metadata {
// fields added with dwarfs-0.7.3, file system version 2.5 //
//=========================================================//
// We don't need to increment the file system minor version
// as file systems created with this new version are still
// readable by older binaries as long as they don't use any
// unsupported features (e.g. FLAC compression).
// The set of features used in this file system image. As long
// as an older binary supports all features, it will be able
// to use images created with newer versions.
27: optional set<string> features
//=========================================================//
// fields added with dwarfs-0.8.0, file system version 2.5 //
//=========================================================//
// The set of categories used in this file system image. Used
// for display and for selecting compression algorithms when
// recompressing the image.
28: optional list<string> category_names
// The category of each block in the file system image. The
// index into this list is the block number and the value
// is an index into `category_names`.
29: optional list<UInt32> block_categories
}