From 1af13f4e6209c9e7b7696822e7cb27e6d35b02ff Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Fri, 27 Nov 2020 17:38:26 +0100 Subject: [PATCH] metadata_v2: more filesystem cleanups --- include/dwarfs/fstypes.h | 5 +- include/dwarfs/metadata_v2.h | 2 +- src/dwarfs/filesystem_v2.cpp | 226 ++++++++++++++++++----------------- src/dwarfs/fstypes.cpp | 5 + 4 files changed, 124 insertions(+), 114 deletions(-) diff --git a/include/dwarfs/fstypes.h b/include/dwarfs/fstypes.h index d261fe0a..962385c7 100644 --- a/include/dwarfs/fstypes.h +++ b/include/dwarfs/fstypes.h @@ -60,7 +60,7 @@ table :-) *************************/ -constexpr uint8_t MAJOR_VERSION = 0; +constexpr uint8_t MAJOR_VERSION = 1; constexpr uint8_t MINOR_VERSION = 0; enum class section_type : uint16_t { @@ -237,4 +237,7 @@ struct chunk_access { }; std::string get_compression_name(compression_type type); + +std::string get_section_name(section_type type); + } // namespace dwarfs diff --git a/include/dwarfs/metadata_v2.h b/include/dwarfs/metadata_v2.h index e66e21a8..7d535edf 100644 --- a/include/dwarfs/metadata_v2.h +++ b/include/dwarfs/metadata_v2.h @@ -162,7 +162,7 @@ class metadata_v2 { metadata_v2() = default; metadata_v2(logger& lgr, folly::ByteRange schema, folly::ByteRange data, - const struct ::stat* defaults, int inode_offset = 0); + const struct ::stat* defaults = nullptr, int inode_offset = 0); metadata_v2& operator=(metadata_v2&&) = default; diff --git a/src/dwarfs/filesystem_v2.cpp b/src/dwarfs/filesystem_v2.cpp index 57196c0a..6c57c6de 100644 --- a/src/dwarfs/filesystem_v2.cpp +++ b/src/dwarfs/filesystem_v2.cpp @@ -22,6 +22,9 @@ #include #include +#include +#include + #include #include @@ -40,6 +43,11 @@ namespace { class filesystem_parser { public: + struct section { + size_t start{0}; + section_header header; + }; + filesystem_parser(std::shared_ptr mm) : mm_(mm) , offset_(sizeof(file_header)) { @@ -49,8 +57,7 @@ class filesystem_parser { const file_header* fh = mm_->as(); - if (::memcmp(&fh->magic[0], "DWARFS", 6) != 0 && - ::memcmp(&fh->magic[0], "NANOFS", 6) != 0) { // keep for compatibility + if (::memcmp(&fh->magic[0], "DWARFS", 6) != 0) { throw std::runtime_error("magic not found"); } @@ -87,6 +94,15 @@ class filesystem_parser { return false; } + template + std::optional
next_section(Logger& lgr) { + section rv; + if (next_section(rv.header, rv.start, lgr)) { + return rv; + } + return std::nullopt; + } + void rewind() { offset_ = sizeof(file_header); } private: @@ -94,6 +110,46 @@ class filesystem_parser { size_t offset_; }; +using section_map = + std::unordered_map; + +folly::ByteRange get_section_data(std::shared_ptr mm, + filesystem_parser::section const& section, + std::vector& buffer) { + if (section.header.compression == compression_type::NONE) { + return mm->range(section.start, section.header.length); + } + + buffer = block_decompressor::decompress(section.header.compression, + mm->as(section.start), + section.header.length); + + return buffer; +} + +metadata_v2 +make_metadata(logger& lgr, std::shared_ptr mm, + section_map const& sections, std::vector& schema_buffer, + std::vector& meta_buffer, + const struct ::stat* stat_defaults = nullptr, + int inode_offset = 0) { + auto schema_it = sections.find(section_type::METADATA_V2_SCHEMA); + auto meta_it = sections.find(section_type::METADATA_V2); + + if (schema_it == sections.end()) { + throw std::runtime_error("no metadata schema found"); + } + + if (meta_it == sections.end()) { + throw std::runtime_error("no metadata found"); + } + + return metadata_v2(lgr, + get_section_data(mm, schema_it->second, schema_buffer), + get_section_data(mm, meta_it->second, meta_buffer), + stat_defaults, inode_offset); +} + template class filesystem_ : public filesystem_v2::impl { public: @@ -142,72 +198,24 @@ filesystem_::filesystem_(logger& lgr, std::shared_ptr mm, filesystem_parser parser(mm_); block_cache cache(lgr, bc_options); - section_header sh; - section_header sh_schema; - section_header sh_data; - size_t start; - size_t start_schema = 0; - size_t start_data = 0; + section_map sections; - while (parser.next_section(sh, start, log_)) { - switch (sh.type) { - case section_type::BLOCK: - cache.insert(sh.compression, mm_->as(start), - static_cast(sh.length)); - break; - - case section_type::METADATA: - // TODO: ignore for now, fail later - break; - - case section_type::METADATA_V2_SCHEMA: - sh_schema = sh; - start_schema = start; - break; - - case section_type::METADATA_V2: - sh_data = sh; - start_data = start; - break; - - default: - throw std::runtime_error("unknown section"); + while (auto s = parser.next_section(log_)) { + if (s->header.type == section_type::BLOCK) { + cache.insert(s->header.compression, mm_->as(s->start), + static_cast(s->header.length)); + } else { + if (!sections.emplace(s->header.type, *s).second) { + throw std::runtime_error("duplicate section: " + + get_section_name(s->header.type)); + } } } - if (start_schema == 0 || sh_schema.length == 0) { - throw std::runtime_error("no metadata schema found"); - } + std::vector schema_buffer; - if (start_data == 0) { - throw std::runtime_error("no metadata found"); - } - - folly::ByteRange schema; - folly::ByteRange data; - std::vector schema_buf; - - if (sh_data.compression == compression_type::NONE) { - data = mm_->range(start_data, sh_data.length); - } else { - meta_buffer_ = block_decompressor::decompress( - sh_data.compression, mm_->as(start_data), sh_data.length); - data = meta_buffer_; - } - - if (sh_schema.compression == compression_type::NONE) { - schema = mm_->range(start_schema, sh_schema.length); - } else { - schema_buf = block_decompressor::decompress(sh_schema.compression, - mm_->as(start_schema), - sh_schema.length); - schema = schema_buf; - } - - log_.info() << "schema: " << schema.size() << " (" << sh_schema.length - << "), data: " << data.size() << " (" << sh_data.length << ")"; - - meta_ = metadata_v2(lgr, schema, data, stat_defaults, inode_offset); + meta_ = make_metadata(lgr, mm_, sections, schema_buffer, meta_buffer_, + stat_defaults, inode_offset); log_.debug() << "read " << cache.block_count() << " blocks and " << meta_.size() << " bytes of metadata"; @@ -344,54 +352,42 @@ void filesystem_v2::rewrite(logger& lgr, progress& prog, log_proxy log(lgr); filesystem_parser parser(mm); - section_header sh; - size_t start; - std::vector meta_raw; - metadata_v2 meta; + section_map sections; - while (parser.next_section(sh, start, log)) { - if (sh.type == section_type::METADATA) { - // TODO: only decompress if needed - meta_raw = block_decompressor::decompress( - sh.compression, mm->as(start), sh.length); - // TODO: FIXME: - // meta = metadata_v2(lgr, meta_raw, nullptr); - break; - } else { + while (auto s = parser.next_section(log)) { + if (s->header.type == section_type::BLOCK) { ++prog.block_count; + } else { + if (!sections.emplace(s->header.type, *s).second) { + throw std::runtime_error("duplicate section: " + + get_section_name(s->header.type)); + } } } + std::vector schema_raw; + std::vector meta_raw; + auto meta = make_metadata(lgr, mm, sections, schema_raw, meta_raw); + struct ::statvfs stbuf; meta.statvfs(&stbuf); prog.original_size = stbuf.f_blocks * stbuf.f_frsize; parser.rewind(); - while (parser.next_section(sh, start, log)) { + while (auto s = parser.next_section(log)) { // TODO: multi-thread this? - switch (sh.type) { - case section_type::BLOCK: { + if (s->header.type == section_type::BLOCK) { auto block = block_decompressor::decompress( - sh.compression, mm->as(start), sh.length); + s->header.compression, mm->as(s->start), s->header.length); prog.filesystem_size += block.size(); writer.write_block(std::move(block)); - break; - } - - case section_type::METADATA: - writer.write_metadata(std::move(meta_raw)); - break; - - case section_type::METADATA_V2: - // TODO... - break; - - default: - throw std::runtime_error("unknown section"); } } + writer.write_metadata_v2_schema(std::move(schema_raw)); + writer.write_metadata_v2(std::move(meta_raw)); + writer.flush(); } @@ -401,31 +397,37 @@ void filesystem_v2::identify(logger& lgr, std::shared_ptr mm, log_proxy log(lgr); filesystem_parser parser(mm); - section_header sh; - size_t start; + section_map sections; - while (parser.next_section(sh, start, log)) { + while (auto s = parser.next_section(log)) { std::vector tmp; - block_decompressor bd(sh.compression, mm->as(start), sh.length, - tmp); - float compression_ratio = float(sh.length) / bd.uncompressed_size(); + block_decompressor bd(s->header.compression, mm->as(s->start), + s->header.length, tmp); + float compression_ratio = float(s->header.length) / bd.uncompressed_size(); - os << "SECTION " << sh.to_string() + os << "SECTION " << s->header.to_string() << ", blocksize=" << bd.uncompressed_size() - << ", ratio=" << fmt::format("{:.2%}%", compression_ratio) << std::endl; + << ", ratio=" << fmt::format("{:.2f}%", compression_ratio) << std::endl; - // TODO: do we need this? - // if (sh.type == section_type::METADATA_V2) { - // // TODO: only decompress if needed - // bd.decompress_frame(bd.uncompressed_size()); - // metadata_v2 meta(lgr, tmp, nullptr); - // struct ::statvfs stbuf; - // meta.statvfs(&stbuf); - // os << "block size: " << stbuf.f_bsize << std::endl; - // os << "inode count: " << stbuf.f_files << std::endl; - // os << "original filesystem size: " << stbuf.f_blocks << std::endl; - // } + if (s->header.type != section_type::BLOCK) { + if (!sections.emplace(s->header.type, *s).second) { + throw std::runtime_error("duplicate section: " + + get_section_name(s->header.type)); + } + } } + + std::vector schema_raw; + std::vector meta_raw; + + auto meta = make_metadata(lgr, mm, sections, schema_raw, meta_raw); + + struct ::statvfs stbuf; + meta.statvfs(&stbuf); + + os << "block size: " << stbuf.f_bsize << std::endl; + os << "inode count: " << stbuf.f_files << std::endl; + os << "original filesystem size: " << stbuf.f_blocks << std::endl; } } // namespace dwarfs diff --git a/src/dwarfs/fstypes.cpp b/src/dwarfs/fstypes.cpp index de8abff2..a779a533 100644 --- a/src/dwarfs/fstypes.cpp +++ b/src/dwarfs/fstypes.cpp @@ -45,6 +45,7 @@ const std::map sections{ #undef SECTION_TYPE_ }; +// TODO: remove const std::map dir_entries{ #define DIR_ENTRY_TYPE_(x) {dir_entry_type::x, #x} DIR_ENTRY_TYPE_(DIR_ENTRY), DIR_ENTRY_TYPE_(DIR_ENTRY_UG), @@ -76,6 +77,10 @@ std::string get_compression_name(compression_type type) { return get_default(compressions, type); } +std::string get_section_name(section_type type) { + return get_default(sections, type); +} + void section_header::dump(std::ostream& os) const { os << "type=" << get_default(sections, type) << ", compression=" << get_default(compressions, compression)