feat: add --change-block-size

This commit is contained in:
Marcus Holland-Moritz 2025-05-15 10:17:54 +02:00
parent e5e0a36ea5
commit e7c35d2402
12 changed files with 648 additions and 93 deletions

View File

@ -242,13 +242,19 @@ Most other options are concerned with compression tuning:
- `--rebuild-metadata`:
Completely rebuild the metadata block. This will upgrade the internal format
of the metadata to the latest version instead of just recompressing the
metadata block.
metadata block. Implies `--recompress=metadata`.
- `--change-block-size`:
Change the block size while recompressing. This will change the block size
according to the size given in `--block-size-bits`. Even if the block size
is unchanged, this will still re-order and re-compress *all* blocks. Implies
`--recompress=all` and `--rebuild-metadata`.
- `--recompress-categories=`[`!`]*category*[`,`...]:
When `--recompress` is set to `all` or `block`, this option controls
which categories of blocks will be recompressed. Adding a `!` in front
of the list allows you to specify which categories will *not* be
recompressed.
recompressed. Cannot be used with `--change-block-size`.
- `-P`, `--pack-metadata=auto`|`none`|[`all`|`chunk_table`|`directories`|`shared_files`|`names`|`names_index`|`symlinks`|`symlinks_index`|`force`|`plain`[`,`...]]:
Which metadata information to store in packed format. This is primarily

View File

@ -487,6 +487,9 @@ class filesystem_v2 final : public filesystem_v2_lite {
std::unique_ptr<thrift::metadata::fs_options> thawed_fs_options() const;
std::future<block_range>
read_raw_block_data(size_t block_no, size_t offset, size_t size) const;
class impl : public impl_lite {
public:
virtual int
@ -504,6 +507,8 @@ class filesystem_v2 final : public filesystem_v2_lite {
unpacked_metadata() const = 0;
virtual std::unique_ptr<thrift::metadata::fs_options>
thawed_fs_options() const = 0;
virtual std::future<block_range>
read_raw_block_data(size_t block, size_t offset, size_t size) const = 0;
};
private:

View File

@ -105,6 +105,11 @@ class inode_reader_v2 {
impl_->cache_blocks(blocks);
}
std::future<block_range>
read_raw_block_data(size_t block_no, size_t offset, size_t size) const {
return impl_->read_raw_block_data(block_no, offset, size);
}
class impl {
public:
virtual ~impl() = default;
@ -127,6 +132,8 @@ class inode_reader_v2 {
virtual void set_cache_tidy_config(cache_tidy_config const& cfg) = 0;
virtual size_t num_blocks() const = 0;
virtual void cache_blocks(std::span<size_t const> blocks) const = 0;
virtual std::future<block_range>
read_raw_block_data(size_t block_no, size_t offset, size_t size) const = 0;
};
private:

View File

@ -36,6 +36,7 @@ namespace dwarfs::utility {
struct rewrite_options {
bool recompress_block{false};
bool recompress_metadata{false};
std::optional<size_t> change_block_size;
std::optional<writer::metadata_options> rebuild_metadata;
std::unordered_set<std::string> recompress_categories;
bool recompress_categories_exclude{false};

View File

@ -32,6 +32,8 @@
#include <utility>
#include <vector>
#include <folly/Function.h>
#include <dwarfs/block_compressor.h>
#include <dwarfs/byte_buffer.h>
#include <dwarfs/compression_constraints.h>
@ -48,6 +50,15 @@ class fs_section;
namespace writer::internal {
// Optional output of `check_block_compression()`: describes an existing
// (compressed) block as seen by its decompressor.
struct block_compression_info {
// Size of the block's data once decompressed.
size_t uncompressed_size{};
// Metadata reported by the block's decompressor, if any.
std::optional<std::string> metadata;
// Constraints the target compressor derives from `metadata`; only set when
// `metadata` is present.
std::optional<compression_constraints> constraints;
};
// Callable that produces a block's data on demand, together with optional
// compression metadata. It is invoked from the compression job, so the data
// does not need to exist when the block is queued.
using delayed_data_fn_type = folly::Function<
std::pair<shared_byte_buffer, std::optional<std::string>>()>;
class filesystem_writer_detail {
public:
virtual ~filesystem_writer_detail() = default;
@ -81,11 +92,15 @@ class filesystem_writer_detail {
virtual void write_history(shared_byte_buffer data) = 0;
virtual void check_block_compression(
compression_type compression, std::span<uint8_t const> data,
std::optional<fragment_category::value_type> cat = std::nullopt) = 0;
std::optional<fragment_category::value_type> cat = std::nullopt,
block_compression_info* info = nullptr) = 0;
virtual void write_section(
section_type type, compression_type compression,
std::span<uint8_t const> data,
std::optional<fragment_category::value_type> cat = std::nullopt) = 0;
virtual void rewrite_block(
delayed_data_fn_type data, size_t uncompressed_size,
std::optional<fragment_category::value_type> cat = std::nullopt) = 0;
virtual void write_compressed_section(dwarfs::internal::fs_section const& sec,
std::span<uint8_t const> data) = 0;
virtual void flush() = 0;

View File

@ -49,6 +49,19 @@ class inode_manager;
class block_manager;
class dir;
// A contiguous byte range within a single block.
struct block_chunk {
// Number of the block the range belongs to.
size_t block{};
// Byte offset of the range within that block.
size_t offset{};
// Length of the range in bytes.
size_t size{};
};
// Describes where the data of one old block ends up after blocks have been
// re-laid out.
struct block_mapping {
// Number of the block in the original layout.
size_t old_block{};
// Ranges in the new layout holding this block's data. `map_chunk()` walks
// them cumulatively, so they are expected to be ordered by increasing offset
// within the old block.
std::vector<block_chunk> chunks{};
// Translates the range [offset, offset + size) of the old block into the
// corresponding range(s) of the new layout. Fails a DWARFS_CHECK if the
// requested range is not fully covered by `chunks`.
std::vector<block_chunk> map_chunk(size_t offset, size_t size) const;
};
class metadata_builder {
public:
// Start with empty metadata
@ -125,6 +138,10 @@ class metadata_builder {
impl_->gather_global_entry_data(ge_data);
}
void remap_blocks(std::span<block_mapping const> mapping) {
impl_->remap_blocks(mapping);
}
thrift::metadata::metadata const& build() { return impl_->build(); }
class impl {
@ -152,6 +169,7 @@ class metadata_builder {
gather_entries(std::span<dir*> dirs, global_entry_data const& ge_data,
uint32_t num_inodes) = 0;
virtual void gather_global_entry_data(global_entry_data const& ge_data) = 0;
virtual void remap_blocks(std::span<block_mapping const> mapping) = 0;
virtual thrift::metadata::metadata const& build() = 0;
};

View File

@ -345,6 +345,11 @@ class filesystem_ final {
return metadata_v2_utils(meta_).thaw_fs_options();
}
std::future<block_range>
read_raw_block_data(size_t block_no, size_t offset, size_t size) const {
return ir_.read_raw_block_data(block_no, offset, size);
}
private:
filesystem_parser make_fs_parser() const {
return filesystem_parser(mm_, image_offset_, options_.image_size);
@ -1387,6 +1392,10 @@ class filesystem_full_
thawed_fs_options() const override {
return fs().thawed_fs_options();
}
std::future<block_range> read_raw_block_data(size_t block_no, size_t offset,
size_t size) const override {
return fs().read_raw_block_data(block_no, offset, size);
}
private:
history history_;
@ -1519,6 +1528,12 @@ filesystem_v2::thawed_fs_options() const {
return full_().thawed_fs_options();
}
// Requests `size` bytes at `offset` of block `block_no`, addressed by block
// rather than by file. Forwards to the full implementation object.
std::future<block_range>
filesystem_v2::read_raw_block_data(size_t block_no, size_t offset,
size_t size) const {
return full_().read_raw_block_data(block_no, offset, size);
}
auto filesystem_v2::full_() const -> impl const& { return this->as_<impl>(); }
} // namespace dwarfs::reader

View File

@ -158,6 +158,9 @@ class inode_reader_ final : public inode_reader_v2::impl {
}
}
std::future<block_range> read_raw_block_data(size_t block_no, size_t offset,
size_t size) const override;
private:
using offset_cache_type =
basic_offset_cache<uint32_t, file_off_t, size_t,
@ -249,6 +252,13 @@ void inode_reader_<LoggerPolicy>::do_readahead(uint32_t inode,
}
}
// Requests `size` bytes at `offset` of block `block_no` directly from the
// block cache, without going through any inode/chunk lookup.
template <typename LoggerPolicy>
std::future<block_range>
inode_reader_<LoggerPolicy>::read_raw_block_data(size_t block_no, size_t offset,
size_t size) const {
return cache_.get(block_no, offset, size);
}
template <typename LoggerPolicy>
std::vector<std::future<block_range>>
inode_reader_<LoggerPolicy>::read_internal(uint32_t inode, size_t const size,

View File

@ -21,8 +21,11 @@
* SPDX-License-Identifier: GPL-3.0-only
*/
#include <vector>
#include <dwarfs/history.h>
#include <dwarfs/logger.h>
#include <dwarfs/malloc_byte_buffer.h>
#include <dwarfs/reader/filesystem_v2.h>
#include <dwarfs/util.h>
#include <dwarfs/utility/rewrite_options.h>
@ -38,6 +41,217 @@
namespace dwarfs::utility {
namespace {
/*
In order to be able to change the block size, we need to first build a list
of all blocks, along with their categories *and* category-specific metadata.
Only blocks in the same category and with the same metadata are eligible for
merging. While category/metadata is mostly irrelevant for splitting, splitting
requires us to know the compression constraints (i.e. the granularity of the
data) so we can split at the correct boundaries.
Granularity also makes splitting/merging more complicated, as we potentially
cannot simply split a block because one of the new blocks would be larger than
the block size. In which case we must move the excess data to the next block,
and so on. Similarly, when merging blocks, we can potentially fill up the
block with data from the next block.
So, ultimately, we need to define for each block in the rewritten filesystem
image the chunks of which it is made up. This mapping will not only be used
to build the new blocks, but also to rebuild the metadata. In the metadata,
both the chunks *and* the chunk table must be updated, since individual chunks
can be either merged or split as well. If we want to be super accurate, we
would also need to update the inode size cache; but this would only be relevant
if we go from a really large block size to a really small one. Then again, it
shouldn't be too hard to update the cache. What we *definitely* need to update
is the `block_categories` as well as the `block_category_metadata` tables in
the metadata.
So, what we need:
- A list of all blocks, along with their categories and metadata
*/
// Everything we need to know about one source block in order to decide how
// it can be split or merged.
struct block_info {
// Number of the block in the source image; `build_block_mappings()` also
// uses it as the index into the block list.
size_t block{};
// Size of the block's data once decompressed.
size_t uncompressed_size{};
// The section of the source image that holds this block.
std::optional<dwarfs::internal::fs_section> section;
// Category name of the block, if the source image assigns one.
std::optional<std::string> category_name;
// Compression metadata of the block, if any.
std::optional<std::string> metadata;
// Compression constraints (e.g. granularity) derived from the metadata.
std::optional<compression_constraints> constraints;
};
/*
- An algorithm for splitting/merging that outputs the new block positions
(numbers) and the chunks that make up each block
struct block_chunk { // see metadata_builder.h
size_t block;
size_t offset;
size_t size;
};
*/
// Describes how one block of the rewritten image is assembled from ranges of
// source blocks.
struct new_block_mapping {
// Number of the block in the rewritten image.
size_t block{};
// Total uncompressed size; the sum of all `chunks` sizes.
size_t size{};
// Ranges of source blocks, in the order they are concatenated.
std::vector<dwarfs::writer::internal::block_chunk> chunks{};
// Category name shared by all source blocks contributing to this block.
std::optional<std::string> category_name;
// Compression metadata shared by all source blocks contributing to this
// block.
std::optional<std::string> metadata;
};
/*
- The algorithm should be deterministic. It doesn't have to be reversible,
i.e. splitting then merging or merging then splitting doesn't have to
yield the same result (or even the original filesystem image). But when
splitting or merging, the result should *always* be the same given the
same input. That means we *could* actually consider grouping blocks by
category and metadata in the output.
TODO: Check if we've gone from a compression with constraints to one
without (i.e. granularity 3 -> 1) and want to go back to the
original compression *without* a block size change, that should
fail early.
We need two new features to support this:
- `filesystem_v2` must allow reading raw block data (i.e. not file-based).
That way, we can easily make use of the block cache while re-composing
the blocks.
- `filesystem_writer` must allow delayed reading of the block data. We
can hopefully refactor the `rewritten_fsblock` to support this.
How does the remapping process work in the metadata builder?
The chunk_table is just a list of the first chunk of each regular file
inode, plus a sentinel at the end. Basically, we need to traverse the
chunk_table and the chunks it references and build new versions of the
chunk_table and chunks using the new blocks.
To build a new chunk from an old chunk, we must be able to figure out which
new blocks an old block is mapped to. This is sort of the opposite of
`new_block_mapping`, where we have stored which chunks of old blocks
make up a new block. So we need a second mapping:
struct block_mapping { // see metadata_builder.h
size_t old_block;
std::vector<block_chunk> chunks;
};
*/
// Both directions of the block re-layout computed by
// `build_block_mappings()`.
struct rw_block_mappings {
// For each block of the rewritten image: the source ranges it is built from.
std::vector<new_block_mapping> new_to_old;
// For each source block: the ranges of the rewritten image holding its data.
// `build_block_mappings()` returns this sorted by `old_block`.
std::vector<dwarfs::writer::internal::block_mapping> old_to_new;
};
/**
 * Compute the block layout of the rewritten image for a new block size.
 *
 * Source blocks are grouped into "streams" by (category name, metadata);
 * only blocks of the same stream are ever merged. Within a stream, the
 * uncompressed data of all blocks is treated as one contiguous sequence
 * and cut into new blocks of at most `block_size` bytes, rounded down to
 * a multiple of the stream's granularity. New blocks are numbered stream
 * by stream, in order of first appearance of each stream.
 *
 * \param blocks      Source blocks. `blocks[i].block` is used as an index
 *                    into `blocks`, so it is expected to equal `i`.
 * \param block_size  Target block size in bytes.
 *
 * \return Both mapping directions; `old_to_new` is sorted by old block
 *         number.
 *
 * Fails a DWARFS_CHECK if a stream's granularity is zero, if the block
 * size cannot hold a single granule of a stream, or if a chunk would not
 * be a multiple of the granularity.
 */
rw_block_mappings build_block_mappings(std::span<block_info const> blocks,
                                       size_t const block_size) {
  using stream_id =
      std::pair<std::optional<std::string>, std::optional<std::string>>;

  // Blocks of each stream, in source order; `stream_map` assigns each
  // distinct (category, metadata) pair its index in `streams`.
  std::vector<std::vector<size_t>> streams;
  std::map<stream_id, size_t> stream_map;

  for (auto const& b : blocks) {
    stream_id id{b.category_name, b.metadata};
    auto [it, inserted] = stream_map.try_emplace(id, streams.size());
    if (inserted) {
      streams.emplace_back();
    }
    streams[it->second].push_back(b.block);
  }

  rw_block_mappings result;

  for (auto const& stream : streams) {
    size_t granularity{1};

    // All blocks of a stream share the same metadata, so the constraints
    // of the first block are taken as representative for the whole stream.
    if (auto const& cc = blocks[stream[0]].constraints; cc && cc->granularity) {
      granularity = cc->granularity.value();
    }

    // A zero granularity would divide by zero below.
    DWARFS_CHECK(granularity > 0, "granularity must not be zero");

    size_t const max_stream_block_size{granularity *
                                       (block_size / granularity)};

    // If not even a single granule fits into a block, every chunk would be
    // empty and the loop below would never make progress.
    DWARFS_CHECK(max_stream_block_size > 0,
                 fmt::format("block size ({}) is smaller than granularity ({})",
                             block_size, granularity));

    std::vector<new_block_mapping> mapped;

    for (size_t block : stream) {
      result.old_to_new.push_back({.old_block = block});
      auto& old_to_new = result.old_to_new.back();
      auto const& b = blocks[block];
      size_t offset{0};

      while (offset < b.uncompressed_size) {
        // Start a new target block once the current one is full.
        if (mapped.empty() || mapped.back().size == max_stream_block_size) {
          mapped.push_back({.block = result.new_to_old.size() + mapped.size(),
                            .category_name = b.category_name,
                            .metadata = b.metadata});
        }

        auto& m = mapped.back();
        size_t const chunk_size{std::min(b.uncompressed_size - offset,
                                         max_stream_block_size - m.size)};

        DWARFS_CHECK(chunk_size % granularity == 0,
                     fmt::format("chunk_size ({}) % granularity ({}) != 0",
                                 chunk_size, granularity));

        // Record the same range in both directions.
        old_to_new.chunks.push_back(
            {.block = m.block, .offset = m.size, .size = chunk_size});
        m.chunks.push_back(
            {.block = block, .offset = offset, .size = chunk_size});

        m.size += chunk_size;
        offset += chunk_size;
      }
    }

    std::ranges::move(mapped, std::back_inserter(result.new_to_old));
  }

  // Streams were processed one after another, so restore source block order.
  std::ranges::sort(result.old_to_new, [](auto const& a, auto const& b) {
    return a.old_block < b.old_block;
  });

  return result;
}
/**
 * Render both mapping directions as human-readable text for debug logging.
 *
 * First lists every new block (with size and, if present, category and
 * metadata) followed by the old-block ranges it is built from, then lists
 * every old block followed by the new-block ranges that hold its data.
 */
std::string block_mappings_to_string(rw_block_mappings const& mapped) {
  std::ostringstream out;

  // Writes one line per chunk, each starting with `prefix`.
  auto const dump_chunks = [&out](auto const& chunk_list, char const* prefix) {
    for (auto const& chunk : chunk_list) {
      out << prefix << chunk.block << ", offset " << chunk.offset << ", size "
          << chunk.size << "\n";
    }
  };

  for (auto const& target : mapped.new_to_old) {
    out << "new block " << target.block << " (size " << target.size;

    if (target.category_name.has_value()) {
      out << ", category " << target.category_name.value();
    }

    if (target.metadata.has_value()) {
      out << ", metadata " << target.metadata.value();
    }

    out << "):\n";

    dump_chunks(target.chunks, " chunk: old block ");
  }

  for (auto const& source : mapped.old_to_new) {
    out << "old block " << source.old_block << ":\n";

    dump_chunks(source.chunks, " chunk: new block ");
  }

  return out.str();
}
} // namespace
void rewrite_filesystem(logger& lgr, dwarfs::reader::filesystem_v2 const& fs,
dwarfs::writer::filesystem_writer& fs_writer,
dwarfs::writer::category_resolver const& cat_resolver,
@ -46,6 +260,18 @@ void rewrite_filesystem(logger& lgr, dwarfs::reader::filesystem_v2 const& fs,
LOG_PROXY(debug_logger_policy, lgr);
if (opts.change_block_size) {
DWARFS_CHECK(opts.recompress_block,
"change_block_size requires recompress_block");
DWARFS_CHECK(opts.recompress_metadata,
"change_block_size requires recompress_metadata");
DWARFS_CHECK(opts.rebuild_metadata,
"change_block_size requires rebuild_metadata");
}
std::vector<block_info> blocks;
rw_block_mappings mapped_blocks;
auto parser = fs.get_parser();
auto& writer = fs_writer.get_internal();
@ -54,20 +280,64 @@ void rewrite_filesystem(logger& lgr, dwarfs::reader::filesystem_v2 const& fs,
size_t block_no{0};
parser->rewind();
while (auto s = parser->next_section()) {
if (s->type() == section_type::BLOCK) {
if (auto catstr = fs.get_block_category(block_no)) {
if (auto cat = cat_resolver.category_value(catstr.value())) {
writer.check_block_compression(s->compression(),
parser->section_data(*s), cat);
{
auto tv = LOG_TIMED_VERBOSE;
while (auto s = parser->next_section()) {
if (s->type() == section_type::BLOCK) {
dwarfs::writer::internal::block_compression_info bci;
auto catstr = fs.get_block_category(block_no);
std::optional<fragment_category::value_type> cat;
if (catstr) {
cat = cat_resolver.category_value(catstr.value());
}
writer.check_block_compression(
s->compression(), parser->section_data(*s), cat,
opts.change_block_size ? &bci : nullptr);
if (opts.change_block_size) {
DWARFS_CHECK(block_no == blocks.size(),
fmt::format("block_no ({}) != blocks.size() ({})",
block_no, blocks.size()));
LOG_DEBUG << "adding block " << block_no
<< " uncompressed size: " << bci.uncompressed_size;
auto& info = blocks.emplace_back();
info.block = block_no;
info.uncompressed_size = bci.uncompressed_size;
info.section = s;
info.category_name = catstr;
info.metadata = bci.metadata;
info.constraints = bci.constraints;
}
++block_no;
}
++block_no;
}
tv << "checked compression for " << block_no << " blocks";
}
if (opts.change_block_size) {
{
auto tv = LOG_TIMED_VERBOSE;
mapped_blocks =
build_block_mappings(blocks, opts.change_block_size.value());
tv << "mapped " << blocks.size() << " source blocks to "
<< mapped_blocks.new_to_old.size() << " target blocks";
}
LOG_DEBUG << block_mappings_to_string(mapped_blocks);
}
}
writer.configure_rewrite(parser->filesystem_size(), fs.num_blocks());
writer.configure_rewrite(parser->filesystem_size(),
opts.change_block_size
? mapped_blocks.new_to_old.size()
: fs.num_blocks());
if (auto header = parser->header()) {
writer.copy_header(*header);
@ -121,49 +391,77 @@ void rewrite_filesystem(logger& lgr, dwarfs::reader::filesystem_v2 const& fs,
return false;
};
if (opts.change_block_size) {
for (auto const& m : mapped_blocks.new_to_old) {
std::optional<fragment_category::value_type> cat;
if (m.category_name) {
cat = cat_resolver.category_value(m.category_name.value());
}
writer.rewrite_block(
[&] {
auto data = malloc_byte_buffer::create_reserve(m.size);
for (auto const& c : m.chunks) {
auto range =
fs.read_raw_block_data(c.block, c.offset, c.size).get();
data.append(range.data(), range.size());
}
DWARFS_CHECK(data.size() == m.size,
fmt::format("data size {} != expected size {}",
data.size(), m.size));
return std::pair{data.share(), m.metadata};
},
m.size, cat);
}
}
parser->rewind();
while (auto s = parser->next_section()) {
switch (s->type()) {
case section_type::BLOCK: {
std::optional<fragment_category::value_type> cat;
bool recompress_block{opts.recompress_block};
case section_type::BLOCK:
if (!opts.change_block_size) {
std::optional<fragment_category::value_type> cat;
bool recompress_block{opts.recompress_block};
if (recompress_block) {
auto catstr = fs.get_block_category(block_no);
if (recompress_block) {
auto catstr = fs.get_block_category(block_no);
if (catstr) {
cat = cat_resolver.category_value(catstr.value());
if (catstr) {
cat = cat_resolver.category_value(catstr.value());
if (!cat) {
LOG_ERROR << "unknown category '" << catstr.value()
<< "' for block " << block_no;
}
if (!cat) {
LOG_ERROR << "unknown category '" << catstr.value()
<< "' for block " << block_no;
}
if (!opts.recompress_categories.empty()) {
bool is_in_set{opts.recompress_categories.contains(catstr.value())};
if (!opts.recompress_categories.empty()) {
bool is_in_set{
opts.recompress_categories.contains(catstr.value())};
recompress_block =
opts.recompress_categories_exclude ? !is_in_set : is_in_set;
recompress_block =
opts.recompress_categories_exclude ? !is_in_set : is_in_set;
}
}
}
if (recompress_block && from_none_to_none(s, cat)) {
recompress_block = false;
}
if (recompress_block) {
log_recompress(s, cat);
writer.write_section(section_type::BLOCK, s->compression(),
parser->section_data(*s), cat);
} else {
copy_compressed(s, cat);
}
++block_no;
}
if (recompress_block && from_none_to_none(s, cat)) {
recompress_block = false;
}
if (recompress_block) {
log_recompress(s, cat);
writer.write_section(section_type::BLOCK, s->compression(),
parser->section_data(*s), cat);
} else {
copy_compressed(s, cat);
}
++block_no;
} break;
break;
case section_type::METADATA_V2_SCHEMA:
case section_type::METADATA_V2:
@ -178,6 +476,12 @@ void rewrite_filesystem(logger& lgr, dwarfs::reader::filesystem_v2 const& fs,
auto builder =
metadata_builder(lgr, std::move(*md), fsopts.get(), fs.version(),
opts.rebuild_metadata.value());
if (opts.change_block_size) {
builder.set_block_size(opts.change_block_size.value());
builder.remap_blocks(mapped_blocks.old_to_new);
}
auto [schema, data] =
metadata_freezer(LOG_GET_LOGGER).freeze(builder.build());

View File

@ -133,7 +133,7 @@ class fsblock {
std::shared_ptr<compression_progress> pctx);
fsblock(section_type type, block_compressor const& bc,
std::span<uint8_t const> data, compression_type data_comp_type,
delayed_data_fn_type data, size_t uncompressed_size,
std::shared_ptr<compression_progress> pctx);
void
@ -371,26 +371,27 @@ class compressed_fsblock : public fsblock::impl {
class rewritten_fsblock : public fsblock::impl {
public:
rewritten_fsblock(section_type type, block_compressor const& bc,
std::span<uint8_t const> data,
compression_type data_comp_type,
delayed_data_fn_type data, size_t uncompressed_size,
std::shared_ptr<compression_progress> pctx)
: type_{type}
, bc_{bc}
, data_{data}
, data_{std::move(data)}
, comp_type_{bc_.type()}
, pctx_{std::move(pctx)}
, data_comp_type_{data_comp_type} {
, uncompressed_size_{uncompressed_size} {
DWARFS_CHECK(bc_, "block_compressor must not be null");
}
void compress(worker_group& wg, std::optional<std::string> meta) override {
DWARFS_CHECK(!meta,
"metadata not supported for rewritten_fsblock::compress");
std::promise<void> prom;
future_ = prom.get_future();
wg.add_job(
[this, prom = std::move(prom), meta = std::move(meta)]() mutable {
compress_job(std::move(prom), std::move(meta));
});
wg.add_job([this, prom = std::move(prom)]() mutable {
compress_job(std::move(prom));
});
}
void wait_until_compressed() override { future_.get(); }
@ -406,7 +407,7 @@ class rewritten_fsblock : public fsblock::impl {
return block_data_.value().span();
}
size_t uncompressed_size() const override { return data_.size(); }
size_t uncompressed_size() const override { return uncompressed_size_; }
size_t size() const override {
std::lock_guard lock(mx_);
@ -444,31 +445,20 @@ class rewritten_fsblock : public fsblock::impl {
}
private:
void compress_job(std::promise<void> prom, std::optional<std::string> meta) {
void compress_job(std::promise<void> prom) {
try {
shared_byte_buffer block;
auto [block, meta] = data_();
{
// TODO: we don't have to do this for uncompressed blocks
block_decompressor bd(data_comp_type_, data_);
block = bd.start_decompression(malloc_byte_buffer::create());
bd.decompress_frame(bd.uncompressed_size());
pctx_->bytes_in += block.size(); // TODO: data_.size()?
if (!meta) {
meta = bd.metadata();
}
pctx_->bytes_in += block.size(); // TODO: data_.size()?
try {
if (meta) {
block = bc_.compress(block, *meta);
} else {
block = bc_.compress(block);
}
} catch (bad_compression_ratio_error const&) {
comp_type_ = compression_type::NONE;
try {
if (meta) {
block = bc_.compress(block, *meta);
} else {
block = bc_.compress(block);
}
} catch (bad_compression_ratio_error const&) {
comp_type_ = compression_type::NONE;
}
pctx_->bytes_out += block.size();
@ -487,14 +477,14 @@ class rewritten_fsblock : public fsblock::impl {
section_type const type_;
block_compressor const& bc_;
mutable std::recursive_mutex mx_;
std::span<uint8_t const> data_;
delayed_data_fn_type data_;
std::optional<shared_byte_buffer> block_data_;
std::future<void> future_;
std::optional<uint32_t> number_;
std::optional<section_header_v2> mutable header_;
compression_type comp_type_;
std::shared_ptr<compression_progress> pctx_;
compression_type const data_comp_type_;
size_t const uncompressed_size_;
};
fsblock::fsblock(section_type type, block_compressor const& bc,
@ -515,10 +505,10 @@ fsblock::fsblock(fs_section sec, std::span<uint8_t const> data,
std::move(pctx))) {}
fsblock::fsblock(section_type type, block_compressor const& bc,
std::span<uint8_t const> data, compression_type data_comp_type,
delayed_data_fn_type data, size_t uncompressed_size,
std::shared_ptr<compression_progress> pctx)
: impl_(std::make_unique<rewritten_fsblock>(type, bc, data, data_comp_type,
std::move(pctx))) {}
: impl_(std::make_unique<rewritten_fsblock>(
type, bc, std::move(data), uncompressed_size, std::move(pctx))) {}
void fsblock::build_section_header(section_header_v2& sh,
fsblock::impl const& fsb,
@ -601,12 +591,15 @@ class filesystem_writer_ final : public filesystem_writer_detail {
void write_metadata_v2_schema(shared_byte_buffer data) override;
void write_metadata_v2(shared_byte_buffer data) override;
void write_history(shared_byte_buffer data) override;
void check_block_compression(
compression_type compression, std::span<uint8_t const> data,
std::optional<fragment_category::value_type> cat) override;
void check_block_compression(compression_type compression,
std::span<uint8_t const> data,
std::optional<fragment_category::value_type> cat,
block_compression_info* info) override;
void write_section(section_type type, compression_type compression,
std::span<uint8_t const> data,
std::optional<fragment_category::value_type> cat) override;
void rewrite_block(delayed_data_fn_type data, size_t uncompressed_size,
std::optional<fragment_category::value_type> cat) override;
void write_compressed_section(fs_section const& sec,
std::span<uint8_t const> data) override;
void flush() override;
@ -624,6 +617,10 @@ class filesystem_writer_ final : public filesystem_writer_detail {
write_block_impl(fragment_category cat, shared_byte_buffer data,
block_compressor const& bc, std::optional<std::string> meta,
physical_block_cb_type physical_block_cb);
void
write_section_delayed_data(section_type type, delayed_data_fn_type data,
size_t uncompressed_size,
std::optional<fragment_category::value_type> cat);
void on_block_merged(block_holder_type holder);
void write_section_impl(section_type type, shared_byte_buffer data);
void write(fsblock const& fsb);
@ -888,7 +885,8 @@ void filesystem_writer_<LoggerPolicy>::write_section_impl(
template <typename LoggerPolicy>
void filesystem_writer_<LoggerPolicy>::check_block_compression(
compression_type compression, std::span<uint8_t const> data,
std::optional<fragment_category::value_type> cat) {
std::optional<fragment_category::value_type> cat,
block_compression_info* info) {
block_compressor const* bc{nullptr};
if (cat) {
@ -897,11 +895,11 @@ void filesystem_writer_<LoggerPolicy>::check_block_compression(
bc = &default_bc_.value();
}
block_decompressor bd(compression, data);
if (auto reqstr = bc->metadata_requirements(); !reqstr.empty()) {
auto req = compression_metadata_requirements<nlohmann::json>{reqstr};
block_decompressor bd(compression, data);
try {
req.check(bd.metadata());
} catch (std::exception const& e) {
@ -912,12 +910,19 @@ void filesystem_writer_<LoggerPolicy>::check_block_compression(
DWARFS_THROW(runtime_error, msg);
}
}
if (info) {
info->uncompressed_size = bd.uncompressed_size();
info->metadata = bd.metadata();
if (info->metadata) {
info->constraints = bc->get_compression_constraints(*info->metadata);
}
}
}
template <typename LoggerPolicy>
void filesystem_writer_<LoggerPolicy>::write_section(
section_type type, compression_type compression,
std::span<uint8_t const> data,
void filesystem_writer_<LoggerPolicy>::write_section_delayed_data(
section_type type, delayed_data_fn_type data, size_t uncompressed_size,
std::optional<fragment_category::value_type> cat) {
{
std::unique_lock lock(mx_);
@ -933,7 +938,8 @@ void filesystem_writer_<LoggerPolicy>::write_section(
auto& bc = get_compressor(type, cat);
auto fsb = std::make_unique<fsblock>(type, bc, data, compression, pctx_);
auto fsb = std::make_unique<fsblock>(type, bc, std::move(data),
uncompressed_size, pctx_);
fsb->set_block_no(section_number_++);
fsb->compress(wg_);
@ -944,6 +950,32 @@ void filesystem_writer_<LoggerPolicy>::write_section(
cond_.notify_one();
}
/**
 * Queue an existing compressed section for recompression.
 *
 * The section is not decompressed here. A decompressor is set up for
 * `data` and moved into a deferred producer that performs the actual
 * decompression when it is invoked. Only the uncompressed size is
 * queried up front.
 *
 * \param type         Section type to write.
 * \param compression  Compression type that `data` is currently stored in.
 * \param data         The compressed section data; the decompressor is
 *                     constructed from this span and used later.
 *                     NOTE(review): presumably the span must stay valid
 *                     until the producer has run; confirm with caller.
 * \param cat          Optional category used to select the compressor.
 */
template <typename LoggerPolicy>
void filesystem_writer_<LoggerPolicy>::write_section(
    section_type type, compression_type compression,
    std::span<uint8_t const> data,
    std::optional<fragment_category::value_type> cat) {
  block_decompressor decompressor(compression, data);

  // Must be read before the decompressor is moved into the closure below.
  auto const total_size = decompressor.uncompressed_size();

  delayed_data_fn_type produce_block =
      [decompressor = std::move(decompressor)]() mutable {
        auto block =
            decompressor.start_decompression(malloc_byte_buffer::create());
        decompressor.decompress_frame(decompressor.uncompressed_size());
        return std::pair{std::move(block), decompressor.metadata()};
      };

  write_section_delayed_data(type, std::move(produce_block), total_size, cat);
}
// Queues a BLOCK section whose data is produced lazily by `data`.
// `uncompressed_size` is the size of the data that `data` will yield and
// `cat` optionally selects the category for the block. Forwards to
// `write_section_delayed_data()` with the section type fixed to BLOCK.
template <typename LoggerPolicy>
void filesystem_writer_<LoggerPolicy>::rewrite_block(
delayed_data_fn_type data, size_t uncompressed_size,
std::optional<fragment_category::value_type> cat) {
write_section_delayed_data(section_type::BLOCK, std::move(data),
uncompressed_size, cat);
}
template <typename LoggerPolicy>
void filesystem_writer_<LoggerPolicy>::write_compressed_section(
fs_section const& sec, std::span<uint8_t const> data) {

View File

@ -135,6 +135,7 @@ class metadata_builder_ final : public metadata_builder::impl {
uint32_t num_inodes) override;
void gather_global_entry_data(global_entry_data const& ge_data) override;
void remap_blocks(std::span<block_mapping const> mapping) override;
thrift::metadata::metadata const& build() override;
@ -269,6 +270,103 @@ void metadata_builder_<LoggerPolicy>::gather_global_entry_data(
md_.timestamp_base() = ge_data.get_timestamp_base();
}
// Rewrites all block references in the metadata after blocks were re-laid
// out.
//
// `mapping` is indexed by old block number (`mapping[chunk.block]`), so it is
// expected to contain one entry per old block, ordered by `old_block`.
//
// Updates `chunks`, `chunk_table` and, when present, `block_categories` and
// `block_category_metadata`.
template <typename LoggerPolicy>
void metadata_builder_<LoggerPolicy>::remap_blocks(
std::span<block_mapping const> mapping) {
using chunks_t = typename decltype(md_.chunks())::value_type;
using chunk_table_t = typename decltype(md_.chunk_table())::value_type;
using categories_t = typename decltype(md_.block_categories())::value_type;
using category_metadata_t =
typename decltype(md_.block_category_metadata())::value_type;
auto tv = LOG_TIMED_VERBOSE;
std::span<typename chunks_t::value_type> old_chunks = md_.chunks().value();
std::span<typename chunk_table_t::value_type> old_chunk_table =
md_.chunk_table().value();
// The loop below relies on at least one entry so that `size() - 1` cannot
// wrap around.
DWARFS_CHECK(!old_chunk_table.empty(), "chunk table must not be empty");
chunks_t new_chunks;
chunk_table_t new_chunk_table;
new_chunk_table.push_back(0);
// Each pair of consecutive chunk table entries delimits one run of chunks;
// rebuild every run in terms of the new blocks.
for (size_t i = 0; i < old_chunk_table.size() - 1; ++i) {
auto chunks = old_chunks.subspan(
old_chunk_table[i], old_chunk_table[i + 1] - old_chunk_table[i]);
std::vector<block_chunk> mapped_chunks;
for (auto const& chunk : chunks) {
DWARFS_CHECK(chunk.block().value() < mapping.size(),
"chunk block out of range");
// A single old chunk may map to several new chunks if it straddles a new
// block boundary.
auto mapped = mapping[chunk.block().value()].map_chunk(
chunk.offset().value(), chunk.size().value());
DWARFS_CHECK(!mapped.empty(), "mapped chunk list is empty");
auto first = mapped.begin();
// If the first new chunk directly continues the previously emitted one
// (same block, adjacent range), grow that one instead of adding another.
if (!mapped_chunks.empty() &&
mapped_chunks.back().block == mapped.front().block &&
mapped_chunks.back().offset + mapped_chunks.back().size ==
mapped.front().offset) {
// merge with previous chunk
mapped_chunks.back().size += mapped.front().size;
++first;
}
mapped_chunks.insert(mapped_chunks.end(), first, mapped.end());
}
for (auto const& chunk : mapped_chunks) {
auto& nc = new_chunks.emplace_back();
nc.block() = chunk.block;
nc.offset() = chunk.offset;
nc.size() = chunk.size;
}
// Entry i + 1 is the end of run i and the start of the next run; the last
// one pushed is the sentinel.
new_chunk_table.push_back(new_chunks.size());
}
auto const& old_categories = md_.block_categories();
auto const& old_category_metadata = md_.block_category_metadata();
if (old_categories.has_value() || old_category_metadata.has_value()) {
// Maps each new block number to one of the old blocks contributing to it.
// If several old blocks contribute, the last one visited wins.
// NOTE(review): block numbers are `size_t` in `block_chunk` but are stored
// as `uint32_t` here.
std::unordered_map<uint32_t, uint32_t> block_map;
for (auto const& m : mapping) {
for (auto const& c : m.chunks) {
block_map[c.block] = m.old_block;
}
}
if (old_categories.has_value()) {
categories_t new_categories;
// Indexing by new block number below assumes the new block numbers are
// exactly 0 .. block_map.size() - 1.
new_categories.resize(block_map.size());
for (auto const& [new_block, old_block] : block_map) {
new_categories[new_block] = old_categories.value().at(old_block);
}
md_.block_categories() = std::move(new_categories);
}
if (old_category_metadata.has_value()) {
category_metadata_t new_category_metadata;
// Only new blocks whose old block had category metadata get an entry.
for (auto const& [new_block, old_block] : block_map) {
auto it = old_category_metadata.value().find(old_block);
if (it != old_category_metadata.value().end()) {
new_category_metadata[new_block] = it->second;
}
}
md_.block_category_metadata() = std::move(new_category_metadata);
}
}
// `old_chunks` / `old_chunk_table` are views into these fields, so they are
// only replaced after all reads through the views are done.
md_.chunks() = std::move(new_chunks);
md_.chunk_table() = std::move(new_chunk_table);
tv << "remapping blocks...";
}
template <typename LoggerPolicy>
void metadata_builder_<LoggerPolicy>::update_inodes() {
bool const update_uid{options_.uid.has_value()};
@ -680,6 +778,33 @@ void metadata_builder_<LoggerPolicy>::upgrade_metadata(
} // namespace
std::vector<block_chunk>
block_mapping::map_chunk(size_t offset, size_t size) const {
std::vector<block_chunk> mapped;
size_t pos{0};
for (auto const& chunk : chunks) {
if (pos + chunk.size > offset) {
auto mapped_offset = offset - pos;
auto mapped_size = std::min(size, chunk.size - mapped_offset);
mapped.push_back(
{chunk.block, chunk.offset + mapped_offset, mapped_size});
size -= mapped_size;
if (size == 0) {
break;
}
offset += mapped_size;
}
pos += chunk.size;
}
DWARFS_CHECK(size == 0, "failed to map chunk, size mismatch");
return mapped;
}
metadata_builder::metadata_builder(logger& lgr, metadata_options const& options)
: impl_{
make_unique_logging_object<impl, metadata_builder_, logger_policies>(

View File

@ -420,7 +420,7 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
bool no_progress = false, remove_header = false, no_section_index = false,
force_overwrite = false, no_history = false,
no_history_timestamps = false, no_history_command_line = false,
rebuild_metadata = false;
rebuild_metadata = false, change_block_size = false;
unsigned level;
int compress_niceness;
uint16_t uid, gid;
@ -531,6 +531,9 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
("rebuild-metadata",
po::value<bool>(&rebuild_metadata)->zero_tokens(),
"fully rebuild metadata")
("change-block-size",
po::value<bool>(&change_block_size)->zero_tokens(),
"change block size when recompressing")
("recompress-categories",
po::value<std::string>(&recompress_categories),
"only recompress blocks of these categories")
@ -890,7 +893,8 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
path = iol.os->canonical(path);
bool recompress = vm.contains("recompress") || rebuild_metadata;
bool recompress =
vm.contains("recompress") || rebuild_metadata || change_block_size;
utility::rewrite_options rw_opts;
if (recompress) {
std::unordered_map<std::string, unsigned> const modes{
@ -900,8 +904,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
{"none", 0},
};
if (recompress_opts.empty() && rebuild_metadata) {
recompress_opts = "metadata";
if (recompress_opts.empty()) {
if (change_block_size) {
recompress_opts = "all";
} else if (rebuild_metadata) {
recompress_opts = "metadata";
}
}
if (auto it = modes.find(recompress_opts); it != modes.end()) {
@ -913,6 +921,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
}
if (!recompress_categories.empty()) {
if (change_block_size) {
iol.err
<< "cannot use --recompress-categories with --change-block-size\n";
return 1;
}
std::string_view input = recompress_categories;
if (input.front() == '!') {
rw_opts.recompress_categories_exclude = true;
@ -1412,9 +1426,12 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
try {
if (recompress) {
if (rebuild_metadata) {
if (rebuild_metadata || change_block_size) {
rw_opts.rebuild_metadata = options.metadata;
}
if (change_block_size) {
rw_opts.change_block_size = UINT64_C(1) << sf_config.block_size_bits;
}
utility::rewrite_filesystem(lgr, *input_filesystem, *fsw, *cat_resolver,
rw_opts);
} else {