refactor(metadata_analyzer): clean up analyzer code

This commit is contained in:
Marcus Holland-Moritz 2025-05-20 13:01:43 +02:00
parent 1188b8563c
commit d87bae1c08

View File

@ -47,8 +47,22 @@ using namespace dwarfs::internal;
using namespace ::apache::thrift;
using namespace ::apache::thrift::frozen;
// Format-spec suffix spliced into the fmt format strings in this file:
// "n" while FMT_VERSION is below 70000, "L" from 70000 onwards.
// NOTE(review): presumably tracks fmt's rename of the locale-aware integer
// specifier -- confirm against the fmt format syntax documentation.
#if FMT_VERSION < 70000
#define DWARFS_FMT_L "n"
#else
#define DWARFS_FMT_L "L"
#endif
namespace {
// Concept for list views: T must derive from the `View` type of
// `frozen::detail::ArrayLayout<List, List::value_type>`, where `List` is the
// decayed type returned by calling `thaw()` on an instance of T.
template <typename T>
concept list_view_type = requires(T a) {
requires std::derived_from<
T, typename frozen::detail::ArrayLayout<
std::decay_t<decltype(a.thaw())>,
typename std::decay_t<decltype(a.thaw())>::value_type>::View>;
};
Layout<thrift::metadata::metadata> const&
get_layout(MappedFrozen<thrift::metadata::metadata> const& meta) {
auto layout = meta.findFirstOfType<
@ -57,11 +71,17 @@ get_layout(MappedFrozen<thrift::metadata::metadata> const& meta) {
return **layout;
}
void analyze_frozen(std::ostream& os,
MappedFrozen<thrift::metadata::metadata> const& meta,
size_t total_size, bool verbose) {
null_logger lgr;
class frozen_analyzer {
public:
frozen_analyzer(MappedFrozen<thrift::metadata::metadata> const& meta,
std::span<uint8_t const> data, bool verbose)
: meta_{meta}
, total_size_{data.size()}
, verbose_{verbose} {}
void print(std::ostream& os) const;
private:
struct usage_info {
usage_info(size_t off, size_t sz, std::string text)
: offset{off}
@ -73,161 +93,193 @@ void analyze_frozen(std::ostream& os,
std::string line;
};
auto& l = get_layout(meta);
std::vector<usage_info> usage;
using detail_bits_t = std::pair<std::string_view, size_t>;
#if FMT_VERSION >= 70000
#define DWARFS_FMT_L "L"
#else
#define DWARFS_FMT_L "n"
#endif
auto get_offset = [&](auto const* ptr) {
return ptr ? reinterpret_cast<uintptr_t>(ptr) -
reinterpret_cast<uintptr_t>(meta.getPosition().start)
: 0;
};
auto get_list_offset = [&](auto const& v) {
struct view_internal {
void const* layout;
uint8_t const* start;
size_t bitOffset;
uint8_t const* data;
size_t count;
};
using list_t = std::decay_t<decltype(v.thaw())>;
static_assert(sizeof(v) == sizeof(view_internal));
static_assert(
std::derived_from<std::decay_t<decltype(v)>,
typename frozen::detail::ArrayLayout<
list_t, typename list_t::value_type>::View>);
auto const* vi = reinterpret_cast<view_internal const*>(&v);
DWARFS_CHECK(vi->count == v.size(), "internal error: size mismatch");
return get_offset(vi->data);
};
auto fmt_size = [&](std::string_view name, std::optional<size_t> count_opt,
size_t size) {
auto count = count_opt.value_or(1);
std::string count_str;
if (count_opt.has_value()) {
count_str = fmt::format("{0:" DWARFS_FMT_L "}", count);
}
return fmt::format("{0:>14} {1:.<24}{2:.>16" DWARFS_FMT_L
"} bytes {3:5.1f}% {4:5.1f} bytes/item\n",
count_str, name, size, 100.0 * size / total_size,
count > 0 ? static_cast<double>(size) / count : 0.0);
};
auto fmt_detail = [&](std::string_view name, size_t count, size_t size,
std::optional<size_t> offset, std::string num) {
std::string range;
if (verbose) {
if (offset) {
range = fmt::format(" {:08x}..{:08x} ", *offset, *offset + size);
} else {
range.append(21, ' ');
}
}
range.append(15, ' ');
return fmt::format("{0}{1:<24}{2:>16" DWARFS_FMT_L "} bytes {3:>6} "
"{4:5.1f} bytes/item\n",
range, name, size, num,
count > 0 ? static_cast<double>(size) / count : 0.0);
};
auto fmt_detail_pct = [&](std::string_view name, size_t count, size_t size,
std::optional<size_t> offset = std::nullopt) {
return fmt_detail(name, count, size, offset,
fmt::format("{0:5.1f}%", 100.0 * size / total_size));
};
auto add_size = [&](std::string_view name, size_t count, size_t offset,
size_t size) {
usage.emplace_back(offset, size, fmt_size(name, count, size));
};
auto add_size_unique = [&](std::string_view name, size_t offset,
size_t size) {
usage.emplace_back(offset, size, fmt_size(name, std::nullopt, size));
};
auto list_size = [&](auto const& list, auto const& field) {
// Number of whole bytes needed to store `list.size()` items of
// `field.layout.itemField.layout.bits` bits each, rounded up.
static size_t list_size(list_view_type auto const& list, auto const& field) {
  auto const total_bits = list.size() * field.layout.itemField.layout.bits;
  return (total_bits + 7) / 8;
}
auto add_list_size = [&](std::string_view name, auto const& list,
auto const& field) {
add_size(name, list.size(), get_list_offset(list), list_size(list, field));
};
size_t get_offset(auto const* ptr) const;
size_t get_list_offset(list_view_type auto const& v) const;
auto add_string_list_size = [&](std::string_view name, auto const& list,
auto const& field) {
auto count = list.size();
if (count > 0) {
auto index_size = list_size(list, field);
auto data_size = list.back().end() - list.front().begin();
auto size = index_size + data_size;
auto fmt = fmt_size(name, count, size) +
fmt_detail_pct("|- data", count, data_size) +
fmt_detail_pct("'- index", count, index_size);
usage.emplace_back(get_list_offset(list), size, fmt);
std::string fmt_size(std::string_view name, std::optional<size_t> count_opt,
size_t size) const;
std::string fmt_detail(std::string_view name, size_t count, size_t size,
std::optional<size_t> offset, std::string num) const;
std::string fmt_detail_pct(std::string_view name, size_t count, size_t size,
std::optional<size_t> offset = std::nullopt) const;
void add_size(std::vector<usage_info>& usage, std::string_view name,
size_t count, size_t offset, size_t size) const;
void add_size_unique(std::vector<usage_info>& usage, std::string_view name,
size_t offset, size_t size) const;
void add_list_size(std::vector<usage_info>& usage, std::string_view name,
auto const& list, auto const& field) const;
void
add_string_list_size(std::vector<usage_info>& usage, std::string_view name,
auto const& list, auto const& field) const;
void
add_string_table_size(std::vector<usage_info>& usage, std::string_view name,
auto const& table, auto const& field) const;
void summarize_details(std::vector<usage_info>& usage, std::string_view name,
size_t count, size_t offset, size_t size,
std::span<detail_bits_t> details) const;
MappedFrozen<thrift::metadata::metadata> const& meta_;
size_t total_size_{0};
bool verbose_{false};
};
// Byte distance of `ptr` from the start position of the mapped metadata.
// A null pointer yields offset 0.
size_t frozen_analyzer::get_offset(auto const* ptr) const {
  if (!ptr) {
    return 0;
  }
  auto const base = reinterpret_cast<uintptr_t>(meta_.getPosition().start);
  auto const addr = reinterpret_cast<uintptr_t>(ptr);
  return addr - base;
}
// Returns the offset of the list's item data, as computed by get_offset()
// from the `data` pointer found inside the view object.
//
// The view is reinterpreted as the local `view_internal` struct to reach that
// pointer. NOTE(review): this presumably mirrors the private member layout of
// the frozen array view -- confirm against the frozen library; only the
// overall object size and the element count are verified here.
size_t frozen_analyzer::get_list_offset(list_view_type auto const& v) const {
struct view_internal {
void const* layout;
uint8_t const* start;
size_t bitOffset;
uint8_t const* data;
size_t count;
};
// Compile-time guard: the stand-in struct must be exactly as large as the view.
static_assert(sizeof(v) == sizeof(view_internal));
auto const* vi = reinterpret_cast<view_internal const*>(&v);
// Runtime guard: the reinterpreted count must agree with the view's size().
DWARFS_CHECK(vi->count == v.size(), "internal error: size mismatch");
return get_offset(vi->data);
}
// Formats the headline usage line for one metadata item: item count (blank
// when `count_opt` is empty), dot-padded name, size in bytes, share of the
// total metadata size in percent, and average bytes per item.
std::string
frozen_analyzer::fmt_size(std::string_view name,
                          std::optional<size_t> count_opt, size_t size) const {
  // Without a count the item is treated as a single entry for the per-item
  // average, and the count column stays empty.
  size_t const items = count_opt.value_or(1);
  std::string const count_text =
      count_opt.has_value() ? fmt::format("{0:" DWARFS_FMT_L "}", items)
                            : std::string{};

  double const percent = 100.0 * size / total_size_;
  double const bytes_per_item =
      items > 0 ? static_cast<double>(size) / items : 0.0;

  return fmt::format("{0:>14} {1:.<24}{2:.>16" DWARFS_FMT_L
                     "} bytes {3:5.1f}% {4:5.1f} bytes/item\n",
                     count_text, name, size, percent, bytes_per_item);
}
// Formats one detail line: a range prefix (only filled in verbose mode), the
// left-aligned `name`, the byte `size`, the caller-supplied `num` column and
// the average number of bytes per item (0.0 when `count` is zero).
std::string
frozen_analyzer::fmt_detail(std::string_view name, size_t count, size_t size,
std::optional<size_t> offset,
std::string num) const {
std::string range;
if (verbose_) {
if (offset) {
// Hex range covering [offset, offset + size).
range = fmt::format(" {:08x}..{:08x} ", *offset, *offset + size);
} else {
// No offset given: append 21 blanks instead of the hex range.
range.append(21, ' ');
}
// NOTE(review): the `};` on the next line leaves the braces unbalanced; it
// looks like a leftover from the removed lambda version shown in this diff --
// confirm against the actual file.
};
}
// Padding that is appended in both verbose and non-verbose mode.
range.append(15, ' ');
return fmt::format("{0}{1:<24}{2:>16" DWARFS_FMT_L "} bytes {3:>6} "
"{4:5.1f} bytes/item\n",
range, name, size, num,
count > 0 ? static_cast<double>(size) / count : 0.0);
}
auto add_string_table_size = [&](std::string_view name, auto const& table,
auto const& field) {
if (auto data_size = table.buffer().size(); data_size > 0) {
auto dict_size =
table.symtab() ? table.symtab()->size() : static_cast<size_t>(0);
auto index_size = list_size(table.index(), field.layout.indexField);
auto size = index_size + data_size + dict_size;
auto count = table.index().size() - (table.packed_index() ? 0 : 1);
auto fmt = fmt_size(name, count, size) +
fmt_detail_pct("|- data", count, data_size,
get_offset(table.buffer().data()));
if (table.symtab()) {
string_table st(lgr, "tmp", table);
auto unpacked_size = st.unpacked_size();
fmt += fmt_detail(
"|- unpacked", count, unpacked_size, std::nullopt,
fmt::format("{0:5.2f}x",
static_cast<double>(unpacked_size) / data_size));
fmt += fmt_detail_pct("|- dict", count, dict_size,
get_offset(table.symtab()->data()));
}
fmt += fmt_detail_pct("'- index", count, index_size,
get_list_offset(table.index()));
usage.emplace_back(get_offset(table.buffer().data()), size, fmt);
// Detail line whose extra column is the share of `size` in the total
// metadata size, rendered as a percentage with one decimal place.
std::string
frozen_analyzer::fmt_detail_pct(std::string_view name, size_t count,
                                size_t size,
                                std::optional<size_t> offset) const {
  double const percent = 100.0 * size / total_size_;
  return fmt_detail(name, count, size, offset,
                    fmt::format("{0:5.1f}%", percent));
}
// Appends a usage entry for an item made up of `count` elements; the entry's
// text is the line produced by fmt_size().
void frozen_analyzer::add_size(std::vector<usage_info>& usage,
                               std::string_view name, size_t count,
                               size_t offset, size_t size) const {
  usage.emplace_back(offset, size,
                     fmt_size(name, std::optional<size_t>{count}, size));
}
// Appends a usage entry for an item that has no element count; fmt_size()
// receives an empty optional for the count.
void frozen_analyzer::add_size_unique(std::vector<usage_info>& usage,
                                      std::string_view name, size_t offset,
                                      size_t size) const {
  std::optional<size_t> const no_count{};
  usage.emplace_back(offset, size, fmt_size(name, no_count, size));
}
// Appends a usage entry for `list`: element count from list.size(), offset
// from get_list_offset() and byte size from list_size().
void frozen_analyzer::add_list_size(std::vector<usage_info>& usage,
                                    std::string_view name, auto const& list,
                                    auto const& field) const {
  auto const count = list.size();
  auto const offset = get_list_offset(list);
  auto const bytes = list_size(list, field);
  add_size(usage, name, count, offset, bytes);
}
// Appends a usage entry for a list of strings, broken down into the string
// data and the list index. Empty lists produce no entry.
void frozen_analyzer::add_string_list_size(std::vector<usage_info>& usage,
                                           std::string_view name,
                                           auto const& list,
                                           auto const& field) const {
  auto const count = list.size();
  if (count == 0) {
    return;
  }

  auto const index_size = list_size(list, field);
  // Span from the beginning of the first string to the end of the last one.
  auto const data_size = list.back().end() - list.front().begin();
  auto const size = index_size + data_size;

  std::string text = fmt_size(name, count, size);
  text += fmt_detail_pct("|- data", count, data_size);
  text += fmt_detail_pct("'- index", count, index_size);

  usage.emplace_back(get_list_offset(list), size, text);
}
// Appends a usage entry for a string table, broken down into its data buffer,
// optional symbol table ("dict") and index. Tables whose buffer is empty
// produce no entry.
void frozen_analyzer::add_string_table_size(std::vector<usage_info>& usage,
std::string_view name,
auto const& table,
auto const& field) const {
if (auto data_size = table.buffer().size(); data_size > 0) {
// The dictionary contributes only when a symbol table is present.
auto dict_size =
table.symtab() ? table.symtab()->size() : static_cast<size_t>(0);
auto index_size = list_size(table.index(), field.layout.indexField);
auto size = index_size + data_size + dict_size;
// NOTE(review): presumably an unpacked index holds one more entry than there
// are strings, hence the -1 -- confirm against the string table format.
auto count = table.index().size() - (table.packed_index() ? 0 : 1);
auto fmt = fmt_size(name, count, size) +
fmt_detail_pct("|- data", count, data_size,
get_offset(table.buffer().data()));
if (table.symtab()) {
// Build a temporary string_table (with a null logger) to obtain the
// unpacked size, reported as a ratio relative to the stored data size.
null_logger lgr;
string_table st(lgr, "tmp", table);
auto unpacked_size = st.unpacked_size();
fmt += fmt_detail(
"|- unpacked", count, unpacked_size, std::nullopt,
fmt::format("{0:5.2f}x",
static_cast<double>(unpacked_size) / data_size));
fmt += fmt_detail_pct("|- dict", count, dict_size,
get_offset(table.symtab()->data()));
}
// NOTE(review): the `};` on the next line leaves the braces unbalanced; it
// looks like a leftover from the removed lambda version shown in this diff --
// confirm against the actual file.
};
fmt += fmt_detail_pct("'- index", count, index_size,
get_list_offset(table.index()));
// The entry is keyed by the offset of the data buffer.
usage.emplace_back(get_offset(table.buffer().data()), size, fmt);
}
}
using detail_bits_t = std::pair<std::string_view, size_t>;
void frozen_analyzer::summarize_details(
std::vector<usage_info>& usage, std::string_view name, size_t count,
size_t offset, size_t size, std::span<detail_bits_t> details) const {
std::ranges::stable_sort(details, std::ranges::greater{},
&detail_bits_t::second);
auto fmt = fmt_size(name, count, size);
for (size_t i = 0; i < details.size(); ++i) {
auto [member, bits] = details[i];
auto tree = i == details.size() - 1 ? "'" : "|";
fmt += fmt_detail_pct(fmt::format("{}- {} [{}]", tree, member, bits), count,
(count * bits + 7) / 8);
}
usage.emplace_back(offset, size, fmt);
}
auto summarize_details = [&](std::string_view name, size_t count,
size_t offset, size_t size,
std::span<detail_bits_t> details) {
std::ranges::stable_sort(details, std::ranges::greater{},
&detail_bits_t::second);
auto fmt = fmt_size(name, count, size);
for (size_t i = 0; i < details.size(); ++i) {
auto [member, bits] = details[i];
auto tree = i == details.size() - 1 ? "'" : "|";
fmt += fmt_detail_pct(fmt::format("{}- {} [{}]", tree, member, bits),
count, (count * bits + 7) / 8);
}
usage.emplace_back(offset, size, fmt);
};
#define META_LIST_SIZE(x) add_list_size(usage, #x, meta_.x(), l.x##Field)
#define META_LIST_SIZE(x) add_list_size(#x, meta.x(), l.x##Field)
#define META_STRING_LIST_SIZE(x) add_string_list_size(#x, meta.x(), l.x##Field)
#define META_STRING_LIST_SIZE(x) \
add_string_list_size(usage, #x, meta_.x(), l.x##Field)
#define META_OPT_LIST_SIZE(x) \
do { \
if (auto list = meta.x()) { \
add_list_size(#x, *list, l.x##Field.layout.valueField); \
if (auto list = meta_.x()) { \
add_list_size(usage, #x, *list, l.x##Field.layout.valueField); \
} \
} while (0)
@ -235,8 +287,8 @@ void analyze_frozen(std::ostream& os,
#define META_OPT_STRING_LIST_SIZE(x) \
do { \
if (auto list = meta.x()) { \
add_string_list_size(#x, *list, l.x##Field.layout.valueField); \
if (auto list = meta_.x()) { \
add_string_list_size(usage, #x, *list, l.x##Field.layout.valueField); \
} \
} while (0)
@ -244,8 +296,8 @@ void analyze_frozen(std::ostream& os,
#define META_OPT_STRING_TABLE_SIZE(x) \
do { \
if (auto table = meta.x()) { \
add_string_table_size(#x, *table, l.x##Field.layout.valueField); \
if (auto table = meta_.x()) { \
add_string_table_size(usage, #x, *table, l.x##Field.layout.valueField); \
} \
} while (0)
@ -263,14 +315,14 @@ void analyze_frozen(std::ostream& os,
} while (0)
#define META_LIST_SIZE_DETAIL_END(x) \
summarize_details(#x, meta.x().size(), get_list_offset(meta.x()), \
list_size(meta.x(), l.x##Field), detail_bits); \
summarize_details(usage, #x, meta_.x().size(), get_list_offset(meta_.x()), \
list_size(meta_.x(), l.x##Field), detail_bits); \
} \
while (0)
#define META_OPT_LIST_SIZE_DETAIL_BEGIN(x) \
do { \
if (auto list = meta.x()) { \
if (auto list = meta_.x()) { \
std::vector<detail_bits_t> detail_bits;
#define META_OPT_ADD_DETAIL_BITS(field, x) \
@ -283,13 +335,17 @@ void analyze_frozen(std::ostream& os,
} while (0)
#define META_OPT_LIST_SIZE_DETAIL_END(x) \
summarize_details(#x, list->size(), get_list_offset(*list), \
summarize_details(usage, #x, list->size(), get_list_offset(*list), \
list_size(*list, l.x##Field.layout.valueField), \
detail_bits); \
} \
} \
while (0)
void frozen_analyzer::print(std::ostream& os) const {
auto& l = get_layout(meta_);
std::vector<usage_info> usage;
META_LIST_SIZE_DETAIL_BEGIN;
META_ADD_DETAIL_BITS(chunks, block);
META_ADD_DETAIL_BITS(chunks, offset);
@ -319,7 +375,7 @@ void analyze_frozen(std::ostream& os,
META_OPT_LIST_SIZE_DETAIL_END(dir_entries);
META_LIST_SIZE(chunk_table);
if (!meta.entry_table_v2_2().empty()) {
if (!meta_.entry_table_v2_2().empty()) {
// deprecated, so only list if non-empty
META_LIST_SIZE(entry_table_v2_2);
}
@ -344,6 +400,59 @@ void analyze_frozen(std::ostream& os,
META_OPT_STRING_LIST_SIZE(category_metadata_json);
META_OPT_MAP_SIZE(block_category_metadata);
if (auto cache = meta_.reg_file_size_cache()) {
add_list_size(
usage, "inode_size_cache", cache->lookup(),
l.reg_file_size_cacheField.layout.valueField.layout.lookupField);
}
if (auto list = meta_.metadata_version_history()) {
size_t history_size =
list_size(*list, l.metadata_version_historyField.layout.valueField);
for (auto const& entry : *list) {
if (entry.dwarfs_version()) {
history_size += entry.dwarfs_version()->size();
}
}
add_size(usage, "metadata_version_history", list->size(),
get_list_offset(*list), history_size);
}
if (auto version = meta_.dwarfs_version()) {
add_size_unique(usage, "dwarfs_version", get_offset(version->data()),
version->size());
}
add_size_unique(usage, "metadata_root", 0, l.size);
add_size_unique(usage, "padding", total_size_ - LayoutRoot::kPaddingBytes,
LayoutRoot::kPaddingBytes);
std::ranges::sort(usage, [this](auto const& a, auto const& b) {
if (verbose_) {
return a.offset < b.offset ||
(a.offset == b.offset &&
(a.size < b.size || (a.size == b.size && a.line < b.line)));
}
return a.size > b.size || (a.size == b.size && a.line < b.line);
});
os << "metadata memory usage:\n";
if (verbose_) {
os << fmt::format(" {:08x}..{:08x} ", 0, total_size_);
}
os << fmt::format(" {0:.<24}{1:.>16" DWARFS_FMT_L
"} bytes {2:6.1f} bytes/inode\n",
"total metadata", total_size_,
static_cast<double>(total_size_) / meta_.inodes().size());
for (auto const& u : usage) {
if (verbose_) {
os << fmt::format(" {:08x}..{:08x} ", u.offset, u.offset + u.size);
}
os << u.line;
}
}
#undef META_LIST_SIZE
#undef META_OPT_STRING_SET_SIZE
#undef META_OPT_STRING_LIST_SIZE
@ -357,61 +466,6 @@ void analyze_frozen(std::ostream& os,
#undef META_OPT_ADD_DETAIL_BITS
#undef META_OPT_LIST_SIZE_DETAIL_END
if (auto cache = meta.reg_file_size_cache()) {
add_list_size(
"inode_size_cache", cache->lookup(),
l.reg_file_size_cacheField.layout.valueField.layout.lookupField);
}
if (auto list = meta.metadata_version_history()) {
size_t history_size =
list_size(*list, l.metadata_version_historyField.layout.valueField);
for (auto const& entry : *list) {
if (entry.dwarfs_version()) {
history_size += entry.dwarfs_version()->size();
}
}
add_size("metadata_version_history", list->size(), get_list_offset(*list),
history_size);
}
if (auto version = meta.dwarfs_version()) {
add_size_unique("dwarfs_version", get_offset(version->data()),
version->size());
}
add_size_unique("metadata_root", 0, l.size);
add_size_unique("padding", total_size - LayoutRoot::kPaddingBytes,
LayoutRoot::kPaddingBytes);
std::ranges::sort(usage, [verbose](auto const& a, auto const& b) {
if (verbose) {
return a.offset < b.offset ||
(a.offset == b.offset &&
(a.size < b.size || (a.size == b.size && a.line < b.line)));
}
return a.size > b.size || (a.size == b.size && a.line < b.line);
});
os << "metadata memory usage:\n";
if (verbose) {
os << fmt::format(" {:08x}..{:08x} ", 0, total_size);
}
os << fmt::format(" {0:.<24}{1:.>16" DWARFS_FMT_L
"} bytes {2:6.1f} bytes/inode\n",
"total metadata", total_size,
static_cast<double>(total_size) / meta.inodes().size());
#undef DWARFS_FMT_L
for (auto const& u : usage) {
if (verbose) {
os << fmt::format(" {:08x}..{:08x} ", u.offset, u.offset + u.size);
}
os << u.line;
}
}
} // namespace
metadata_analyzer::metadata_analyzer(
@ -426,7 +480,7 @@ void metadata_analyzer::print_layout(std::ostream& os) const {
}
// Writes the frozen metadata usage analysis for `meta_` / `data_` to `os`;
// `verbose` is forwarded to the analyzer.
// NOTE(review): the two statements below look like the before/after versions
// of the same call from this commit's diff (free function vs. frozen_analyzer
// class) -- confirm that only one of them is present in the actual file.
void metadata_analyzer::print_frozen(std::ostream& os, bool verbose) const {
analyze_frozen(os, meta_, data_.size(), verbose);
frozen_analyzer(meta_, data_, verbose).print(os);
}
} // namespace dwarfs::reader::internal