metadata_v2: more filesystem cleanups

This commit is contained in:
Marcus Holland-Moritz 2020-11-27 17:38:26 +01:00
parent 8431a937cb
commit 1af13f4e62
4 changed files with 124 additions and 114 deletions

View File

@ -60,7 +60,7 @@ table :-)
*************************/
constexpr uint8_t MAJOR_VERSION = 0;
constexpr uint8_t MAJOR_VERSION = 1;
constexpr uint8_t MINOR_VERSION = 0;
enum class section_type : uint16_t {
@ -237,4 +237,7 @@ struct chunk_access {
};
std::string get_compression_name(compression_type type);
std::string get_section_name(section_type type);
} // namespace dwarfs

View File

@ -162,7 +162,7 @@ class metadata_v2 {
metadata_v2() = default;
metadata_v2(logger& lgr, folly::ByteRange schema, folly::ByteRange data,
const struct ::stat* defaults, int inode_offset = 0);
const struct ::stat* defaults = nullptr, int inode_offset = 0);
metadata_v2& operator=(metadata_v2&&) = default;

View File

@ -22,6 +22,9 @@
#include <cstddef>
#include <cstring>
#include <optional>
#include <unordered_map>
#include <folly/container/Enumerate.h>
#include <fmt/core.h>
@ -40,6 +43,11 @@ namespace {
class filesystem_parser {
public:
// A section as located by the parser: where it starts plus its header.
struct section {
  // Offset handed to mm->range() / mm->as<>() to reach this section's data.
  size_t start = 0;
  // Header describing the section (type, compression, length).
  section_header header;
};
filesystem_parser(std::shared_ptr<mmif> mm)
: mm_(mm)
, offset_(sizeof(file_header)) {
@ -49,8 +57,7 @@ class filesystem_parser {
const file_header* fh = mm_->as<file_header>();
if (::memcmp(&fh->magic[0], "DWARFS", 6) != 0 &&
::memcmp(&fh->magic[0], "NANOFS", 6) != 0) { // keep for compatibility
if (::memcmp(&fh->magic[0], "DWARFS", 6) != 0) {
throw std::runtime_error("magic not found");
}
@ -87,6 +94,15 @@ class filesystem_parser {
return false;
}
template <typename Logger>
std::optional<section> next_section(Logger& lgr) {
section rv;
if (next_section(rv.header, rv.start, lgr)) {
return rv;
}
return std::nullopt;
}
// Reset the read position to just past the file header, i.e. the same
// offset the constructor starts from, so the sections can be walked again.
void rewind() {
  offset_ = sizeof(file_header);
}
private:
@ -94,6 +110,46 @@ class filesystem_parser {
size_t offset_;
};
// Lookup table from section type to the section of that type found in the
// image. A map holds at most one entry per type; the code filling it uses
// the result of emplace() to reject duplicate sections.
using section_map =
std::unordered_map<section_type, filesystem_parser::section>;
// Return the bytes of `section` as a byte range.
//
// mm       mapped image the section lives in
// section  location and header of the section to read
// buffer   scratch storage; only written when the section is compressed
//
// For an uncompressed section the range comes straight from `mm` and
// `buffer` is left untouched. For a compressed section the data is
// decompressed into `buffer` and the returned range is built from `buffer`,
// so the caller must keep `buffer` alive (and unmodified) for as long as the
// returned range is in use.
folly::ByteRange get_section_data(std::shared_ptr<mmif> mm,
                                  filesystem_parser::section const& section,
                                  std::vector<uint8_t>& buffer) {
  auto const& hdr = section.header;

  if (hdr.compression != compression_type::NONE) {
    buffer = block_decompressor::decompress(
        hdr.compression, mm->as<uint8_t>(section.start), hdr.length);
    return buffer;
  }

  return mm->range(section.start, hdr.length);
}
// Build a metadata_v2 object from the METADATA_V2_SCHEMA and METADATA_V2
// entries of `sections`.
//
// lgr            logger forwarded to metadata_v2
// mm             mapped image the sections refer to
// sections       section lookup table; must contain both metadata sections
// schema_buffer  scratch storage for the schema section (see below)
// meta_buffer    scratch storage for the metadata section (see below)
// stat_defaults  forwarded to metadata_v2 (may be nullptr)
// inode_offset   forwarded to metadata_v2
//
// Throws std::runtime_error if either section is missing; the schema is
// checked first.
//
// The two buffers are passed to get_section_data() and receive decompressed
// data for compressed sections. The ranges given to metadata_v2 may refer to
// them, so the caller should keep both buffers alive while the result is used.
metadata_v2
make_metadata(logger& lgr, std::shared_ptr<mmif> mm,
              section_map const& sections, std::vector<uint8_t>& schema_buffer,
              std::vector<uint8_t>& meta_buffer,
              const struct ::stat* stat_defaults = nullptr,
              int inode_offset = 0) {
  auto const missing = sections.end();

  auto const schema_pos = sections.find(section_type::METADATA_V2_SCHEMA);
  if (schema_pos == missing) {
    throw std::runtime_error("no metadata schema found");
  }

  auto const meta_pos = sections.find(section_type::METADATA_V2);
  if (meta_pos == missing) {
    throw std::runtime_error("no metadata found");
  }

  return metadata_v2(lgr,
                     get_section_data(mm, schema_pos->second, schema_buffer),
                     get_section_data(mm, meta_pos->second, meta_buffer),
                     stat_defaults, inode_offset);
}
template <typename LoggerPolicy>
class filesystem_ : public filesystem_v2::impl {
public:
@ -142,72 +198,24 @@ filesystem_<LoggerPolicy>::filesystem_(logger& lgr, std::shared_ptr<mmif> mm,
filesystem_parser parser(mm_);
block_cache cache(lgr, bc_options);
section_header sh;
section_header sh_schema;
section_header sh_data;
size_t start;
size_t start_schema = 0;
size_t start_data = 0;
section_map sections;
while (parser.next_section(sh, start, log_)) {
switch (sh.type) {
case section_type::BLOCK:
cache.insert(sh.compression, mm_->as<uint8_t>(start),
static_cast<size_t>(sh.length));
break;
case section_type::METADATA:
// TODO: ignore for now, fail later
break;
case section_type::METADATA_V2_SCHEMA:
sh_schema = sh;
start_schema = start;
break;
case section_type::METADATA_V2:
sh_data = sh;
start_data = start;
break;
default:
throw std::runtime_error("unknown section");
while (auto s = parser.next_section(log_)) {
if (s->header.type == section_type::BLOCK) {
cache.insert(s->header.compression, mm_->as<uint8_t>(s->start),
static_cast<size_t>(s->header.length));
} else {
if (!sections.emplace(s->header.type, *s).second) {
throw std::runtime_error("duplicate section: " +
get_section_name(s->header.type));
}
}
}
if (start_schema == 0 || sh_schema.length == 0) {
throw std::runtime_error("no metadata schema found");
}
std::vector<uint8_t> schema_buffer;
if (start_data == 0) {
throw std::runtime_error("no metadata found");
}
folly::ByteRange schema;
folly::ByteRange data;
std::vector<uint8_t> schema_buf;
if (sh_data.compression == compression_type::NONE) {
data = mm_->range(start_data, sh_data.length);
} else {
meta_buffer_ = block_decompressor::decompress(
sh_data.compression, mm_->as<uint8_t>(start_data), sh_data.length);
data = meta_buffer_;
}
if (sh_schema.compression == compression_type::NONE) {
schema = mm_->range(start_schema, sh_schema.length);
} else {
schema_buf = block_decompressor::decompress(sh_schema.compression,
mm_->as<uint8_t>(start_schema),
sh_schema.length);
schema = schema_buf;
}
log_.info() << "schema: " << schema.size() << " (" << sh_schema.length
<< "), data: " << data.size() << " (" << sh_data.length << ")";
meta_ = metadata_v2(lgr, schema, data, stat_defaults, inode_offset);
meta_ = make_metadata(lgr, mm_, sections, schema_buffer, meta_buffer_,
stat_defaults, inode_offset);
log_.debug() << "read " << cache.block_count() << " blocks and "
<< meta_.size() << " bytes of metadata";
@ -344,54 +352,42 @@ void filesystem_v2::rewrite(logger& lgr, progress& prog,
log_proxy<debug_logger_policy> log(lgr);
filesystem_parser parser(mm);
section_header sh;
size_t start;
std::vector<uint8_t> meta_raw;
metadata_v2 meta;
section_map sections;
while (parser.next_section(sh, start, log)) {
if (sh.type == section_type::METADATA) {
// TODO: only decompress if needed
meta_raw = block_decompressor::decompress(
sh.compression, mm->as<uint8_t>(start), sh.length);
// TODO: FIXME:
// meta = metadata_v2(lgr, meta_raw, nullptr);
break;
} else {
while (auto s = parser.next_section(log)) {
if (s->header.type == section_type::BLOCK) {
++prog.block_count;
} else {
if (!sections.emplace(s->header.type, *s).second) {
throw std::runtime_error("duplicate section: " +
get_section_name(s->header.type));
}
}
}
std::vector<uint8_t> schema_raw;
std::vector<uint8_t> meta_raw;
auto meta = make_metadata(lgr, mm, sections, schema_raw, meta_raw);
struct ::statvfs stbuf;
meta.statvfs(&stbuf);
prog.original_size = stbuf.f_blocks * stbuf.f_frsize;
parser.rewind();
while (parser.next_section(sh, start, log)) {
while (auto s = parser.next_section(log)) {
// TODO: multi-thread this?
switch (sh.type) {
case section_type::BLOCK: {
if (s->header.type == section_type::BLOCK) {
auto block = block_decompressor::decompress(
sh.compression, mm->as<uint8_t>(start), sh.length);
s->header.compression, mm->as<uint8_t>(s->start), s->header.length);
prog.filesystem_size += block.size();
writer.write_block(std::move(block));
break;
}
case section_type::METADATA:
writer.write_metadata(std::move(meta_raw));
break;
case section_type::METADATA_V2:
// TODO...
break;
default:
throw std::runtime_error("unknown section");
}
}
writer.write_metadata_v2_schema(std::move(schema_raw));
writer.write_metadata_v2(std::move(meta_raw));
writer.flush();
}
@ -401,31 +397,37 @@ void filesystem_v2::identify(logger& lgr, std::shared_ptr<mmif> mm,
log_proxy<debug_logger_policy> log(lgr);
filesystem_parser parser(mm);
section_header sh;
size_t start;
section_map sections;
while (parser.next_section(sh, start, log)) {
while (auto s = parser.next_section(log)) {
std::vector<uint8_t> tmp;
block_decompressor bd(sh.compression, mm->as<uint8_t>(start), sh.length,
tmp);
float compression_ratio = float(sh.length) / bd.uncompressed_size();
block_decompressor bd(s->header.compression, mm->as<uint8_t>(s->start),
s->header.length, tmp);
float compression_ratio = float(s->header.length) / bd.uncompressed_size();
os << "SECTION " << sh.to_string()
os << "SECTION " << s->header.to_string()
<< ", blocksize=" << bd.uncompressed_size()
<< ", ratio=" << fmt::format("{:.2%}%", compression_ratio) << std::endl;
<< ", ratio=" << fmt::format("{:.2f}%", compression_ratio) << std::endl;
// TODO: do we need this?
// if (sh.type == section_type::METADATA_V2) {
// // TODO: only decompress if needed
// bd.decompress_frame(bd.uncompressed_size());
// metadata_v2 meta(lgr, tmp, nullptr);
// struct ::statvfs stbuf;
// meta.statvfs(&stbuf);
// os << "block size: " << stbuf.f_bsize << std::endl;
// os << "inode count: " << stbuf.f_files << std::endl;
// os << "original filesystem size: " << stbuf.f_blocks << std::endl;
// }
if (s->header.type != section_type::BLOCK) {
if (!sections.emplace(s->header.type, *s).second) {
throw std::runtime_error("duplicate section: " +
get_section_name(s->header.type));
}
}
}
std::vector<uint8_t> schema_raw;
std::vector<uint8_t> meta_raw;
auto meta = make_metadata(lgr, mm, sections, schema_raw, meta_raw);
struct ::statvfs stbuf;
meta.statvfs(&stbuf);
os << "block size: " << stbuf.f_bsize << std::endl;
os << "inode count: " << stbuf.f_files << std::endl;
os << "original filesystem size: " << stbuf.f_blocks << std::endl;
}
} // namespace dwarfs

View File

@ -45,6 +45,7 @@ const std::map<section_type, std::string> sections{
#undef SECTION_TYPE_
};
// TODO: remove
const std::map<dir_entry_type, std::string> dir_entries{
#define DIR_ENTRY_TYPE_(x) {dir_entry_type::x, #x}
DIR_ENTRY_TYPE_(DIR_ENTRY), DIR_ENTRY_TYPE_(DIR_ENTRY_UG),
@ -76,6 +77,10 @@ std::string get_compression_name(compression_type type) {
return get_default(compressions, type);
}
// Return the name for `type` as looked up in the `sections` table.
// The result for a type missing from the table is whatever get_default()
// yields in that case.
std::string get_section_name(section_type type) {
  std::string name = get_default(sections, type);
  return name;
}
void section_header::dump(std::ostream& os) const {
os << "type=" << get_default(sections, type)
<< ", compression=" << get_default(compressions, compression)