From f373144b73236b8ab2604213fb65f25009263dcc Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Thu, 26 Nov 2020 20:56:28 +0100 Subject: [PATCH] First take at metadata v2 --- .gitmodules | 3 + CMakeLists.txt | 101 +++++- fbthrift | 1 + include/dwarfs/entry.h | 89 ++++- include/dwarfs/filesystem.h | 3 + include/dwarfs/filesystem_writer.h | 9 + include/dwarfs/fstypes.h | 3 + include/dwarfs/inode.h | 6 + include/dwarfs/metadata_v2.h | 147 ++++++++ src/dwarfs/entry.cpp | 81 +++++ src/dwarfs/filesystem.cpp | 27 ++ src/dwarfs/filesystem_writer.cpp | 58 +++- src/dwarfs/fstypes.cpp | 3 +- src/dwarfs/inode_manager.cpp | 13 + src/dwarfs/metadata_v2.cpp | 537 +++++++++++++++++++++++++++++ src/dwarfs/scanner.cpp | 126 ++++++- src/dwarfs/worker_group.cpp | 2 +- src/dwarfsck.cpp | 5 +- src/mkdwarfs.cpp | 12 +- thrift/metadata.thrift | 67 ++++ 20 files changed, 1251 insertions(+), 42 deletions(-) create mode 160000 fbthrift create mode 100644 include/dwarfs/metadata_v2.h create mode 100644 src/dwarfs/metadata_v2.cpp create mode 100644 thrift/metadata.thrift diff --git a/.gitmodules b/.gitmodules index 248267b1..2556091d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "folly"] path = folly url = https://github.com/facebook/folly +[submodule "fbthrift"] + path = fbthrift + url = https://github.com/facebook/fbthrift/ diff --git a/CMakeLists.txt b/CMakeLists.txt index f7ab66e3..1d4058c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,12 @@ pkg_check_modules(LIBLZ4 IMPORTED_TARGET liblz4>=1.8.3) pkg_check_modules(LIBLZMA IMPORTED_TARGET liblzma>=5.2.4) pkg_check_modules(LIBZSTD IMPORTED_TARGET libzstd>=1.3.8) +set(compiler_only + ON + CACHE BOOL "only build thrift compiler") + add_subdirectory(folly EXCLUDE_FROM_ALL) +add_subdirectory(fbthrift EXCLUDE_FROM_ALL) if(WITH_TESTS) # Download and unpack googletest at configure time @@ -99,6 +104,7 @@ list( src/dwarfs/inode_reader.cpp src/dwarfs/logger.cpp src/dwarfs/metadata.cpp + 
src/dwarfs/metadata_v2.cpp src/dwarfs/metadata_writer.cpp src/dwarfs/mmap.cpp src/dwarfs/options.cpp @@ -132,11 +138,92 @@ if(WITH_TESTS) gtest_discover_tests(dwarfs_test) endif() +add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_constants.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_constants.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_data.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_data.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_for_each_field.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_layouts.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_layouts.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_metadata.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_metadata.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_types.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_types.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_types.tcc + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_types_custom_protocol.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_visit_union.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_visitation.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_data.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_data.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_types.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_types.tcc + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_types.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_types_custom_protocol.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_constants.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_constants.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_metadata.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_metadata.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_visitation.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_for_each_field.h + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_visit_union.h + COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift + COMMAND + cp ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/thrift/frozen.thrift + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/ + COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs + COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/thrift/metadata.thrift + thrift/dwarfs/metadata.thrift + COMMAND + cd ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs && + ${CMAKE_CURRENT_BINARY_DIR}/bin/thrift1 --gen mstch_cpp2:frozen2 + metadata.thrift + COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift && + ${CMAKE_CURRENT_BINARY_DIR}/bin/thrift1 --gen mstch_cpp2 frozen.thrift + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/bin/thrift1 + ${CMAKE_CURRENT_SOURCE_DIR}/thrift/metadata.thrift) + +list( + APPEND + INCLUDE_DIRS + ${CMAKE_CURRENT_BINARY_DIR}/folly + ${CMAKE_CURRENT_BINARY_DIR}/thrift + ${CMAKE_CURRENT_SOURCE_DIR}/folly + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift + ${CMAKE_CURRENT_BINARY_DIR}) + +add_library( + thrift_light + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/CompactProtocol.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/BinaryProtocol.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/DebugProtocol.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/JSONProtocolCommon.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp/protocol/TProtocolException.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp/util/VarintUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/gen/module_types_cpp.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/frozen/Frozen.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/frozen/FrozenUtil.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/frozen/schema/MemorySchema.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/lib/thrift/gen-cpp2/frozen_types.cpp) + +set_property(TARGET thrift_light PROPERTY CXX_STANDARD 17) + +target_include_directories(thrift_light PRIVATE ${INCLUDE_DIRS}) + +add_library( + metadata_thrift + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_layouts.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_types.cpp + ${CMAKE_CURRENT_BINARY_DIR}/thrift/dwarfs/gen-cpp2/metadata_data.cpp) + +set_property(TARGET metadata_thrift PROPERTY CXX_STANDARD 17) + +target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS}) + foreach(tgt dwarfs ${BINARY_TARGETS}) - target_include_directories( - ${tgt} SYSTEM - PRIVATE ${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/folly - ${CMAKE_CURRENT_SOURCE_DIR}/folly) + target_include_directories(${tgt} SYSTEM PRIVATE ${Boost_INCLUDE_DIRS} + ${INCLUDE_DIRS}) target_include_directories(${tgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include) @@ -150,9 +237,11 @@ foreach(tgt dwarfs ${BINARY_TARGETS}) target_compile_options(${tgt} PRIVATE -Wall -Wextra -pedantic) - set_property(TARGET ${tgt} PROPERTY CXX_STANDARD 20) + set_property(TARGET ${tgt} PROPERTY CXX_STANDARD 17) set_property(TARGET ${tgt} PROPERTY CXX_STANDARD_REQUIRED ON) set_property(TARGET ${tgt} PROPERTY CXX_EXTENSIONS OFF) + + add_dependencies(${tgt} metadata_thrift) endforeach() target_compile_definitions(dwarfs-bin PRIVATE FUSE_USE_VERSION=35 @@ -162,6 +251,8 @@ foreach(tgt ${BINARY_TARGETS}) target_link_libraries( ${tgt} dwarfs + metadata_thrift + thrift_light folly ${Boost_LIBRARIES} PkgConfig::LIBLZ4 diff --git a/fbthrift b/fbthrift new file mode 160000 index 00000000..42536064 --- /dev/null +++ b/fbthrift @@ -0,0 +1 @@ +Subproject commit 
42536064c10726c50ce07a0ffd0910c17d8781da diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index fa9245e4..faa2b292 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -23,17 +23,95 @@ #include #include +#include #include #include +#include #include #include -#include "file_interface.h" -#include "fstypes.h" +#include "dwarfs/file_interface.h" +#include "dwarfs/fstypes.h" + +#include "dwarfs/gen-cpp2/metadata_types.h" namespace dwarfs { +struct global_entry_data { + void add_uid(uint16_t uid) { add(uid, uids, next_uid_index); } + + void add_gid(uint16_t gid) { add(gid, gids, next_gid_index); } + + void add_mode(uint16_t mode) { add(mode, modes, next_mode_index); } + + void add(uint16_t val, std::unordered_map& map, + uint16_t& next_index) { + if (map.emplace(val, next_index).second) { + ++next_index; + } + } + + void add_time(uint64_t time) { + if (time < timestamp_base) { + timestamp_base = time; + } + } + + void add_name(std::string const& name) { names.emplace(name, 0); } + + void add_link(std::string const& link) { links.emplace(link, 0); } + + void index() { + index(names); + index(links); + } + + void index(std::unordered_map& map); + + uint16_t get_uid_index(uint16_t uid) const { return uids.at(uid); } + + uint16_t get_gid_index(uint16_t gid) const { return gids.at(gid); } + + uint16_t get_mode_index(uint16_t mode) const { return modes.at(mode); } + + uint32_t get_name_index(std::string const& name) const { + return names.at(name); + } + + uint32_t get_link_index(std::string const& link) const { + return links.at(link); + } + + uint64_t get_time_offset(uint64_t time) const { + return time - timestamp_base; + } + + std::vector get_uids() const; + + std::vector get_gids() const; + + std::vector get_modes() const; + + std::vector get_names() const; + + std::vector get_links() const; + + // TODO: make private + template + std::vector get_vector(std::unordered_map const& map) const; + + std::unordered_map uids; + std::unordered_map 
gids; + std::unordered_map modes; + std::unordered_map names; + std::unordered_map links; + uint16_t next_uid_index{0}; + uint16_t next_gid_index{0}; + uint16_t next_mode_index{0}; + uint64_t timestamp_base{std::numeric_limits::max()}; +}; + class file; class link; class dir; @@ -72,6 +150,9 @@ class entry : public file_interface { void pack(dir_entry& de) const; void pack(dir_entry_ug& de) const; void pack(dir_entry_ug_time& de) const; + void + pack(thrift::metadata::entry& entry_v2, global_entry_data const& data) const; + void update(global_entry_data& data) const; virtual void accept(entry_visitor& v, bool preorder = false) = 0; virtual uint32_t inode_num() const = 0; @@ -130,8 +211,12 @@ class dir : public entry { pack(uint8_t* buf, std::function const& offset_cb) const = 0; + virtual void pack(thrift::metadata::metadata& mv2, + global_entry_data const& data) const = 0; virtual size_t packed_entry_size() const = 0; virtual void pack_entry(uint8_t* buf) const = 0; + virtual void pack_entry(thrift::metadata::metadata& mv2, + global_entry_data const& data) const = 0; uint32_t inode_num() const override { return inode_; } protected: diff --git a/include/dwarfs/filesystem.h b/include/dwarfs/filesystem.h index ddc2326f..32992725 100644 --- a/include/dwarfs/filesystem.h +++ b/include/dwarfs/filesystem.h @@ -86,6 +86,8 @@ class filesystem { void dump(std::ostream& os) const { impl_->dump(os); } + void dump_v2(std::ostream& os) const { impl_->dump_v2(os); } + void walk(std::function const& func) { impl_->walk(func); } @@ -143,6 +145,7 @@ class filesystem { virtual ~impl() = default; virtual void dump(std::ostream& os) const = 0; + virtual void dump_v2(std::ostream& os) const = 0; virtual void walk(std::function const& func) const = 0; virtual const dir_entry* find(const char* path) const = 0; diff --git a/include/dwarfs/filesystem_writer.h b/include/dwarfs/filesystem_writer.h index e72f4e36..f6123c7f 100644 --- a/include/dwarfs/filesystem_writer.h +++ 
b/include/dwarfs/filesystem_writer.h @@ -54,6 +54,10 @@ class filesystem_writer { progress& prog, const block_compressor& bc, size_t max_queue_size); + filesystem_writer(std::ostream& os, logger& lgr, worker_group& wg, + progress& prog, const block_compressor& bc, + const block_compressor& metadata_bc, size_t max_queue_size); + // section create_block(); // section create_metadata(); @@ -67,6 +71,10 @@ class filesystem_writer { impl_->write_metadata(std::move(data)); } + void write_metadata_v2(std::vector&& data) { + impl_->write_metadata_v2(std::move(data)); + } + void flush() { impl_->flush(); } size_t size() const { return impl_->size(); } @@ -77,6 +85,7 @@ class filesystem_writer { virtual void write_block(std::vector&& data) = 0; virtual void write_metadata(std::vector&& data) = 0; + virtual void write_metadata_v2(std::vector&& data) = 0; virtual void flush() = 0; virtual size_t size() const = 0; }; diff --git a/include/dwarfs/fstypes.h b/include/dwarfs/fstypes.h index af30a895..eebc9d03 100644 --- a/include/dwarfs/fstypes.h +++ b/include/dwarfs/fstypes.h @@ -106,6 +106,9 @@ enum class section_type : uint16_t { // the block size which is needed for working with the // chunk lists. Also defines inode offsets being used // and the total inode count (for out-of-bounds checks). + + METADATA_V2 = 7, + // Frozen metadata. 
}; enum class dir_entry_type : uint8_t { diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index 2991d239..36070ae8 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -26,6 +26,10 @@ namespace dwarfs { +namespace thrift::metadata { +struct chunk; +} + class file; class file_interface; @@ -38,5 +42,7 @@ class inode : public file_interface { virtual const file_interface* any() const = 0; // TODO virtual void add_chunk(size_t block, size_t offset, size_t size) = 0; virtual const std::vector& chunks() const = 0; + virtual void + append_chunks(std::vector& vec) const = 0; }; } // namespace dwarfs diff --git a/include/dwarfs/metadata_v2.h b/include/dwarfs/metadata_v2.h new file mode 100644 index 00000000..d8aaa8b4 --- /dev/null +++ b/include/dwarfs/metadata_v2.h @@ -0,0 +1,147 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "fstypes.h" +#include "logger.h" + +namespace dwarfs { + +class metadata_v2 { + public: + metadata_v2() = default; + + metadata_v2(logger& lgr, std::vector&& data, + const struct ::stat* defaults); + + metadata_v2& operator=(metadata_v2&&) = default; + + void + dump(std::ostream& os, + std::function const& icb) const { + impl_->dump(os, icb); + } + +#if 0 + static void get_stat_defaults(struct ::stat* defaults); + + size_t size() const { return impl_->size(); } + + bool empty() const { return !impl_ || impl_->empty(); } + + size_t block_size() const { return impl_->block_size(); } + + unsigned block_size_bits() const { return impl_->block_size_bits(); } + + void walk(std::function const& func) const { + impl_->walk(func); + } + + const dir_entry* find(const char* path) const { return impl_->find(path); } + + const dir_entry* find(int inode) const { return impl_->find(inode); } + + const dir_entry* find(int inode, const char* name) const { + return impl_->find(inode, name); + } + + int getattr(const dir_entry* de, struct ::stat* stbuf) const { + return impl_->getattr(de, stbuf); + } + + int access(const dir_entry* de, int mode, uid_t uid, gid_t gid) const { + return impl_->access(de, mode, uid, gid); + } + + const directory* opendir(const dir_entry* de) const { + return impl_->opendir(de); + } + + const dir_entry* + readdir(const directory* d, size_t offset, std::string* name) const { + return impl_->readdir(d, offset, name); + } + + size_t dirsize(const directory* d) const { return impl_->dirsize(d); } + + int readlink(const dir_entry* de, char* buf, size_t size) const { + return impl_->readlink(de, buf, size); + } + + int readlink(const dir_entry* de, std::string* buf) const { + return impl_->readlink(de, buf); + } + + int statvfs(struct ::statvfs* stbuf) const { return impl_->statvfs(stbuf); } + + int open(const dir_entry* de) const { return impl_->open(de); 
} + + const chunk_type* get_chunks(int inode, size_t& num) const { + return impl_->get_chunks(inode, num); + } +#endif + + class impl { + public: + virtual ~impl() = default; + + virtual void dump( + std::ostream& os, + std::function const& icb) const = 0; +#if 0 + virtual size_t size() const = 0; + virtual bool empty() const = 0; + virtual size_t block_size() const = 0; + virtual unsigned block_size_bits() const = 0; + virtual void + walk(std::function const& func) const = 0; + virtual const dir_entry* find(const char* path) const = 0; + virtual const dir_entry* find(int inode) const = 0; + virtual const dir_entry* find(int inode, const char* name) const = 0; + virtual int getattr(const dir_entry* de, struct ::stat* stbuf) const = 0; + virtual int + access(const dir_entry* de, int mode, uid_t uid, gid_t gid) const = 0; + virtual const directory* opendir(const dir_entry* de) const = 0; + virtual const dir_entry* + readdir(const directory* d, size_t offset, std::string* name) const = 0; + virtual size_t dirsize(const directory* d) const = 0; + virtual int readlink(const dir_entry* de, char* buf, size_t size) const = 0; + virtual int readlink(const dir_entry* de, std::string* buf) const = 0; + virtual int statvfs(struct ::statvfs* stbuf) const = 0; + virtual int open(const dir_entry* de) const = 0; + virtual const chunk_type* get_chunks(int inode, size_t& num) const = 0; +#endif + }; + + private: + std::unique_ptr impl_; +}; +} // namespace dwarfs diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index 303be74a..e3a253ae 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -28,6 +28,7 @@ #include #include +#include #include @@ -39,6 +40,41 @@ namespace dwarfs { +template +std::vector +global_entry_data::get_vector(std::unordered_map const& map) const { + using namespace folly::gen; + std::vector> pairs(map.begin(), map.end()); + return from(pairs) | orderBy([](auto const& p) { return p.second; }) | + get<0>() | as(); +} + +std::vector 
global_entry_data::get_uids() const { + return get_vector(uids); +} + +std::vector global_entry_data::get_gids() const { + return get_vector(gids); +} + +std::vector global_entry_data::get_modes() const { + return get_vector(modes); +} + +std::vector global_entry_data::get_names() const { + return get_vector(names); +} + +std::vector global_entry_data::get_links() const { + return get_vector(links); +} + +void global_entry_data::index(std::unordered_map& map) { + using namespace folly::gen; + uint32_t ix = 0; + from(map) | get<0>() | order | [&](std::string const& s) { map[s] = ix++; }; +} + template class dir_ : public dir { public: @@ -51,6 +87,13 @@ class dir_ : public dir { entry::pack(*de); } + void pack_entry(thrift::metadata::metadata& mv2, + global_entry_data const& data) const override { + mv2.inode_index.at(inode_num()) = mv2.entries.size(); + mv2.entries.emplace_back(); + entry::pack(mv2.entries.back(), data); + } + size_t packed_size() const override { return offsetof(directory, u) + sizeof(DirEntryType) * entries_.size(); } @@ -74,6 +117,23 @@ class dir_ : public dir { ++de; } } + + void pack(thrift::metadata::metadata& mv2, + global_entry_data const& data) const override { + thrift::metadata::directory dir; + dir.self_inode = inode_num(); + dir.parent_inode = + has_parent() ? 
std::dynamic_pointer_cast(parent())->inode_num() + : 0; + dir.first_entry = mv2.entries.size(); + dir.entry_count = entries_.size(); + mv2.directories.push_back(dir); + for (entry_ptr const& e : entries_) { + mv2.inode_index.at(e->inode_num()) = mv2.entries.size(); + mv2.entries.emplace_back(); + e->pack(mv2.entries.back(), data); + } + } }; entry::entry(const std::string& name, std::shared_ptr parent, @@ -156,6 +216,27 @@ void entry::pack(dir_entry_ug_time& de) const { pack(de.ug); } +void entry::update(global_entry_data& data) const { + data.add_uid(stat_.st_uid); + data.add_gid(stat_.st_gid); + data.add_mode(stat_.st_mode & 0xFFFF); + data.add_time(stat_.st_atime); + data.add_time(stat_.st_mtime); + data.add_time(stat_.st_ctime); +} + +void entry::pack(thrift::metadata::entry& entry_v2, + global_entry_data const& data) const { + entry_v2.name_index = has_parent() ? data.get_name_index(name_) : 0; + entry_v2.mode = data.get_mode_index(stat_.st_mode & 0xFFFF); + entry_v2.owner = data.get_uid_index(stat_.st_uid); + entry_v2.group = data.get_gid_index(stat_.st_gid); + entry_v2.atime = data.get_time_offset(stat_.st_atime); + entry_v2.mtime = data.get_time_offset(stat_.st_mtime); + entry_v2.ctime = data.get_time_offset(stat_.st_ctime); + entry_v2.inode = inode_num(); +} + entry::type_t file::type() const { return E_FILE; } std::string_view file::hash() const { diff --git a/src/dwarfs/filesystem.cpp b/src/dwarfs/filesystem.cpp index 52a1f8d0..811bb35e 100644 --- a/src/dwarfs/filesystem.cpp +++ b/src/dwarfs/filesystem.cpp @@ -31,6 +31,7 @@ #include "dwarfs/fstypes.h" #include "dwarfs/inode_reader.h" #include "dwarfs/metadata.h" +#include "dwarfs/metadata_v2.h" #include "dwarfs/progress.h" namespace dwarfs { @@ -102,6 +103,7 @@ class filesystem_ : public filesystem::impl { const struct ::stat* stat_defaults, int inode_offset); void dump(std::ostream& os) const override; + void dump_v2(std::ostream& os) const override; void walk(std::function const& func) const override; 
const dir_entry* find(const char* path) const override; const dir_entry* find(int inode) const override; @@ -126,6 +128,7 @@ class filesystem_ : public filesystem::impl { log_proxy log_; std::shared_ptr mm_; metadata meta_; + metadata_v2 meta_v2_; inode_reader ir_; }; @@ -156,6 +159,15 @@ filesystem_::filesystem_(logger& lgr, std::shared_ptr mm, stat_defaults, inode_offset); break; + case section_type::METADATA_V2: + // TODO: handle in-place uncompressed metadata + meta_v2_ = + metadata_v2(lgr, + block_decompressor::decompress( + sh.compression, mm_->as(start), sh.length), + stat_defaults); + break; + default: throw std::runtime_error("unknown section"); } @@ -184,6 +196,17 @@ void filesystem_::dump(std::ostream& os) const { }); } +template +void filesystem_::dump_v2(std::ostream& os) const { + meta_v2_.dump(os, [&](const std::string& indent, uint32_t inode) { + size_t num = 0; + const chunk_type* chunk = meta_.get_chunks(inode, num); // TODO + + os << indent << num << " chunks in inode " << inode << "\n"; + ir_.dump(os, indent + " ", chunk, num); + }); +} + template void filesystem_::walk( std::function const& func) const { @@ -325,6 +348,10 @@ void filesystem::rewrite(logger& lgr, progress& prog, std::shared_ptr mm, writer.write_metadata(std::move(meta_raw)); break; + case section_type::METADATA_V2: + // TODO... 
+ break; + default: throw std::runtime_error("unknown section"); } diff --git a/src/dwarfs/filesystem_writer.cpp b/src/dwarfs/filesystem_writer.cpp index afff0f05..7eea3ad9 100644 --- a/src/dwarfs/filesystem_writer.cpp +++ b/src/dwarfs/filesystem_writer.cpp @@ -86,20 +86,22 @@ class fsblock { }; public: - fsblock(section_type type, std::vector&& data) + fsblock(section_type type, const block_compressor& bc, + std::vector&& data) : type_(type) + , bc_(bc) , uncompressed_size_(data.size()) , state_(std::make_shared(std::move(data))) {} template - void compress(worker_group& wg, const block_compressor& bc, LogProxy& lp) { + void compress(worker_group& wg, LogProxy& lp) { lp.trace() << "block queued for compression"; std::shared_ptr s = state_; - wg.add_job([&, bc, s] { + wg.add_job([&, s] { lp.trace() << "block compression started"; - s->compress(bc, lp); + s->compress(bc_, lp); }); } @@ -107,6 +109,8 @@ class fsblock { section_type type() const { return type_; } + compression_type compression() const { return bc_.type(); } + const std::vector& data() const { return state_->data(); ; @@ -118,6 +122,7 @@ class fsblock { private: const section_type type_; + block_compressor const& bc_; const size_t uncompressed_size_; std::shared_ptr state_; }; @@ -127,17 +132,21 @@ class filesystem_writer_ : public filesystem_writer::impl { public: filesystem_writer_(logger& lgr, std::ostream& os, worker_group& wg, progress& prog, const block_compressor& bc, + const block_compressor& metadata_bc, size_t max_queue_size); ~filesystem_writer_() noexcept; void write_block(std::vector&& data) override; void write_metadata(std::vector&& data) override; + void write_metadata_v2(std::vector&& data) override; void flush() override; size_t size() const override { return os_.tellp(); } private: - void write_section(section_type type, std::vector&& data); - void write(section_type type, const std::vector& data); + void write_section(section_type type, std::vector&& data, + block_compressor const& 
bc); + void write(section_type type, compression_type compression, + const std::vector& data); void write(const char* data, size_t size); template void write(const T& obj); @@ -150,6 +159,7 @@ class filesystem_writer_ : public filesystem_writer::impl { worker_group& wg_; progress& prog_; const block_compressor& bc_; + const block_compressor& metadata_bc_; const size_t max_queue_size_; log_proxy log_; std::deque> queue_; @@ -162,11 +172,13 @@ class filesystem_writer_ : public filesystem_writer::impl { template filesystem_writer_::filesystem_writer_( logger& lgr, std::ostream& os, worker_group& wg, progress& prog, - const block_compressor& bc, size_t max_queue_size) + const block_compressor& bc, const block_compressor& metadata_bc, + size_t max_queue_size) : os_(os) , wg_(wg) , prog_(prog) , bc_(bc) + , metadata_bc_(metadata_bc) , max_queue_size_(max_queue_size) , log_(lgr) , flush_(false) @@ -219,7 +231,7 @@ void filesystem_writer_::writer_thread() { << size_with_unit(fsb->uncompressed_size()) << " to " << size_with_unit(fsb->size()); - write(fsb->type(), fsb->data()); + write(fsb->type(), fsb->compression(), fsb->data()); } } @@ -263,10 +275,11 @@ void filesystem_writer_::write_file_header() { template void filesystem_writer_::write(section_type type, + compression_type compression, const std::vector& data) { section_header sh; sh.type = type; - sh.compression = bc_.type(); + sh.compression = compression; sh.unused = 0; sh.length = data.size(); write(sh); @@ -279,7 +292,8 @@ void filesystem_writer_::write(section_type type, template void filesystem_writer_::write_section( - section_type type, std::vector&& data) { + section_type type, std::vector&& data, + block_compressor const& bc) { { std::unique_lock lock(mx_); @@ -288,9 +302,9 @@ void filesystem_writer_::write_section( } } - auto fsb = std::make_unique(type, std::move(data)); + auto fsb = std::make_unique(type, bc, std::move(data)); - fsb->compress(wg_, bc_, log_); + fsb->compress(wg_, log_); { std::lock_guard 
lock(mx_); @@ -303,13 +317,19 @@ void filesystem_writer_::write_section( template void filesystem_writer_::write_block( std::vector&& data) { - write_section(section_type::BLOCK, std::move(data)); + write_section(section_type::BLOCK, std::move(data), bc_); } template void filesystem_writer_::write_metadata( std::vector&& data) { - write_section(section_type::METADATA, std::move(data)); + write_section(section_type::METADATA, std::move(data), metadata_bc_); +} + +template +void filesystem_writer_::write_metadata_v2( + std::vector&& data) { + write_section(section_type::METADATA_V2, std::move(data), metadata_bc_); } template @@ -333,7 +353,15 @@ filesystem_writer::filesystem_writer(std::ostream& os, logger& lgr, worker_group& wg, progress& prog, const block_compressor& bc, size_t max_queue_size) + : filesystem_writer(os, lgr, wg, prog, bc, bc, max_queue_size) {} + +filesystem_writer::filesystem_writer(std::ostream& os, logger& lgr, + worker_group& wg, progress& prog, + const block_compressor& bc, + const block_compressor& metadata_bc, + size_t max_queue_size) : impl_( make_unique_logging_object( - lgr, os, wg, prog, bc, max_queue_size)) {} + lgr, os, wg, prog, bc, metadata_bc, max_queue_size)) {} + } // namespace dwarfs diff --git a/src/dwarfs/fstypes.cpp b/src/dwarfs/fstypes.cpp index 541debaa..d04e14de 100644 --- a/src/dwarfs/fstypes.cpp +++ b/src/dwarfs/fstypes.cpp @@ -39,7 +39,8 @@ const std::map sections{ SECTION_TYPE_(META_INODE_INDEX), SECTION_TYPE_(META_CHUNK_INDEX), SECTION_TYPE_(META_DIRECTORIES), - SECTION_TYPE_(META_CONFIG) + SECTION_TYPE_(META_CONFIG), + SECTION_TYPE_(METADATA_V2), #undef SECTION_TYPE_ }; diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 642f5110..cf665aae 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -28,6 +28,8 @@ #include "dwarfs/inode_manager.h" #include "dwarfs/script.h" +#include "dwarfs/gen-cpp2/metadata_types.h" + namespace dwarfs { template @@ -76,6 +78,17 @@ class 
inode_manager_ : public inode_manager { const std::vector& chunks() const override { return chunks_; } + void + append_chunks(std::vector& vec) const override { + for (auto c : chunks_) { + thrift::metadata::chunk chnk; + chnk.block = access::block(c); + chnk.offset = access::offset(c); + chnk.size = access::size(c); + vec.push_back(chnk); + } + } + private: uint32_t num_{std::numeric_limits::max()}; file const* file_{nullptr}; diff --git a/src/dwarfs/metadata_v2.cpp b/src/dwarfs/metadata_v2.cpp new file mode 100644 index 00000000..82a4148e --- /dev/null +++ b/src/dwarfs/metadata_v2.cpp @@ -0,0 +1,537 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . 
+ */ + +#include + +#include + +#include + +#include "dwarfs/metadata_v2.h" + +#include "dwarfs/gen-cpp2/metadata_layouts.h" +#include "dwarfs/gen-cpp2/metadata_types.h" +#include "dwarfs/gen-cpp2/metadata_types_custom_protocol.h" +#include +#include +#include + +namespace dwarfs { + +// TODO: merge this into the metadata implementation behind interface + +template +class metadata_v2_ : public metadata_v2::impl { + public: + template + using view = typename ::apache::thrift::frozen::View; + using entry_view = view; + using directory_view = view; + + metadata_v2_(logger& lgr, std::vector&& meta, + const struct ::stat* /*defaults*/) + : data_(std::move(meta)) + , meta_(::apache::thrift::frozen::mapFrozen( + data_)) + , root_(meta_.entries()[meta_.inode_index()[0]]) + , inode_offset_(meta_.chunk_index_offset()) + , log_(lgr) { + // TODO: defaults? + log_.debug() << ::apache::thrift::debugString(meta_.thaw()); + + ::apache::thrift::frozen::Layout layout; + ::apache::thrift::frozen::schema::Schema schema; + folly::ByteRange range(data_); + apache::thrift::CompactSerializer::deserialize(range, schema); + log_.debug() << ::apache::thrift::debugString(schema); + } + + void dump(std::ostream& os, + std::function const& icb) + const override; + +#if 0 + size_t size() const override { return data_.size(); } + + bool empty() const override { return data_.empty(); } + + size_t block_size() const override { + return static_cast(1) << cfg_->block_size_bits; + } + + unsigned block_size_bits() const override { return cfg_->block_size_bits; } + + void walk(std::function const& func) const override; + const dir_entry* find(const char* path) const override; + const dir_entry* find(int inode) const override; + const dir_entry* find(int inode, const char* name) const override; + int getattr(const dir_entry* de, struct ::stat* stbuf) const override; + int access(const dir_entry* de, int mode, uid_t uid, + gid_t gid) const override; + const directory* opendir(const dir_entry* de) const 
override; + const dir_entry* + readdir(const directory* d, size_t offset, std::string* name) const override; + size_t dirsize(const directory* d) const override { + return d->count + 2; // adds '.' and '..', which we fake in ;-) + } + int readlink(const dir_entry* de, char* buf, size_t size) const override; + int readlink(const dir_entry* de, std::string* buf) const override; + int statvfs(struct ::statvfs* stbuf) const override; + int open(const dir_entry* de) const override; + + const chunk_type* get_chunks(int inode, size_t& num) const override; +#endif + + private: + void dump(std::ostream& os, const std::string& indent, entry_view entry, + std::function const& icb) const; + void dump(std::ostream& os, const std::string& indent, directory_view dir, + std::function const& icb) const; + + std::string modestring(uint16_t mode) const; + + size_t reg_filesize(uint32_t inode) const { + uint32_t cur = meta_.chunk_index()[inode]; + uint32_t end = meta_.chunk_index()[inode + 1]; + size_t size = 0; + while (cur < end) { + size += meta_.chunks()[cur++].size(); + } + return size; + } + + size_t filesize(entry_view entry, uint16_t mode) const { + if (S_ISREG(mode)) { + return reg_filesize(entry.inode()); + } else if (S_ISLNK(mode)) { + return meta_.links()[meta_.dir_link_index()[entry.inode()]].size(); + } else { + return 0; + } + } + +#if 0 + void walk(const dir_entry* de, + std::function const& func) const; + + std::string name(const dir_entry* de) const { + return std::string(as(de->name_offset), de->name_size); + } + + size_t linksize(const dir_entry* de) const { + return *as(de->u.offset); + } + + std::string linkname(const dir_entry* de) const { + size_t offs = de->u.offset; + return std::string(as(offs + sizeof(uint16_t)), *as(offs)); + } + + const char* linkptr(const dir_entry* de) const { + return as(de->u.offset + sizeof(uint16_t)); + } + + const directory* getdir(const dir_entry* de) const { + return as(de->u.offset); + } + + template + const T* as(size_t offset 
= 0) const { + return reinterpret_cast( + reinterpret_cast(data_.data()) + offset); + } + + const dir_entry* get_entry(int inode) const { + inode -= inode_offset_; + return inode >= 0 && inode < static_cast(cfg_->inode_count) + ? as(inode_index_[inode]) + : nullptr; + } + + void parse(const struct ::stat* defaults); + + const uint32_t* chunk_index_ = nullptr; + const uint32_t* inode_index_ = nullptr; + const dir_entry* root_ = nullptr; + const meta_config* cfg_ = nullptr; + std::shared_ptr dir_reader_; +#endif + std::vector data_; + ::apache::thrift::frozen::MappedFrozen meta_; + entry_view root_; + const int inode_offset_; + log_proxy log_; +}; + +template +void metadata_v2_::dump( + std::ostream& os, const std::string& indent, entry_view entry, + std::function const& icb) const { + auto mode = meta_.modes()[entry.mode()]; + auto inode = entry.inode(); + + os << indent << " " << modestring(mode); + + if (inode > 0) { + os << " " << meta_.names()[entry.name_index()]; + } + + if (S_ISREG(mode)) { + uint32_t cur = meta_.chunk_index()[inode - inode_offset_]; + uint32_t end = meta_.chunk_index()[inode - inode_offset_ + 1]; + os << " [" << cur << ", " << end << "]"; + size_t size = 0; + while (cur < end) { + size += meta_.chunks()[cur++].size(); + } + os << " " << size << "\n"; + // os << " " << filesize(entry, mode) << "\n"; + // icb(indent + " ", de->inode); + } else if (S_ISDIR(mode)) { + auto dir_index = meta_.dir_link_index()[inode]; + os << " => " + << "" + << "\n"; + dump(os, indent + " ", meta_.directories()[dir_index], std::move(icb)); + } else if (S_ISLNK(mode)) { + os << " -> " << meta_.links()[meta_.dir_link_index()[inode]] << "\n"; + } else { + os << " (unknown type)\n"; + } +} + +template +void metadata_v2_::dump( + std::ostream& os, const std::string& indent, directory_view dir, + std::function const& icb) const { + auto count = dir.entry_count(); + auto first = dir.first_entry(); + os << indent << "(" << count << ") entries\n"; + + for (size_t i = 0; i < 
count; ++i) { + dump(os, indent, meta_.entries()[first + i], icb); + } +} + +template +void metadata_v2_::dump( + std::ostream& os, + std::function const& icb) const { + dump(os, "", root_, icb); +} + +template +std::string metadata_v2_::modestring(uint16_t mode) const { + std::ostringstream oss; + + oss << (mode & S_ISUID ? 'U' : '-'); + oss << (mode & S_ISGID ? 'G' : '-'); + oss << (mode & S_ISVTX ? 'S' : '-'); + oss << (S_ISDIR(mode) ? 'd' : S_ISLNK(mode) ? 'l' : '-'); + oss << (mode & S_IRUSR ? 'r' : '-'); + oss << (mode & S_IWUSR ? 'w' : '-'); + oss << (mode & S_IXUSR ? 'x' : '-'); + oss << (mode & S_IRGRP ? 'r' : '-'); + oss << (mode & S_IWGRP ? 'w' : '-'); + oss << (mode & S_IXGRP ? 'x' : '-'); + oss << (mode & S_IROTH ? 'r' : '-'); + oss << (mode & S_IWOTH ? 'w' : '-'); + oss << (mode & S_IXOTH ? 'x' : '-'); + + return oss.str(); +} + +#if 0 +template +void metadata_::parse(const struct ::stat* defaults) { + size_t offset = 0; + + while (offset + sizeof(section_header) <= size()) { + const section_header* sh = as(offset); + + log_.debug() << "section_header@" << offset << " (" << sh->to_string() + << ")"; + + offset += sizeof(section_header); + + if (offset + sh->length > size()) { + throw std::runtime_error("truncated metadata"); + } + + if (sh->compression != compression_type::NONE) { + throw std::runtime_error("unsupported metadata compression type"); + } + + switch (sh->type) { + case section_type::META_TABLEDATA: + case section_type::META_DIRECTORIES: + // ok, ignore + break; + + case section_type::META_CHUNK_INDEX: + chunk_index_ = as(offset); + break; + + case section_type::META_INODE_INDEX: + inode_index_ = as(offset); + break; + + case section_type::META_CONFIG: + cfg_ = as(offset); + break; + + default: + throw std::runtime_error("unknown metadata section"); + } + + offset += sh->length; + } + + // TODO: moar checkz + + if (!cfg_) { + throw std::runtime_error("no metadata configuration found"); + } + + struct ::stat stat_defaults; + + if 
(defaults) { + stat_defaults = *defaults; + } else { + metadata::get_stat_defaults(&stat_defaults); + } + + chunk_index_ -= cfg_->chunk_index_offset; + inode_index_ -= cfg_->inode_index_offset; + + root_ = as(inode_index_[0]); + + dir_reader_ = dir_reader::create(cfg_->de_type, stat_defaults, + reinterpret_cast(data_.data()), + inode_offset_); +} + +template +void metadata_::walk( + const dir_entry* de, + std::function const& func) const { + func(de); + if (S_ISDIR(de->mode)) { + auto dir = getdir(de); + for (size_t i = 0; i < dir->count; ++i) { + walk(dir_reader_->readdir(dir, i), func); + } + } +} + +template +void metadata_::walk( + std::function const& func) const { + walk(root_, func); +} + +template +const dir_entry* metadata_::find(const char* path) const { + while (*path and *path == '/') { + ++path; + } + + const dir_entry* de = root_; + + while (*path) { + const char* next = ::strchr(path, '/'); + size_t clen = next ? next - path : ::strlen(path); + + de = dir_reader_->find(getdir(de), path, clen); + + if (!de) { + break; + } + + path = next ? 
next + 1 : path + clen; + } + + return de; +} + +template +const dir_entry* metadata_::find(int inode) const { + return get_entry(inode); +} + +template +const dir_entry* +metadata_::find(int inode, const char* name) const { + auto de = get_entry(inode); + + if (de) { + de = dir_reader_->find(getdir(de), name, ::strlen(name)); + } + + return de; +} + +template +int metadata_::getattr(const dir_entry* de, + struct ::stat* stbuf) const { + ::memset(stbuf, 0, sizeof(*stbuf)); + dir_reader_->getattr(de, stbuf, filesize(de)); + return 0; +} + +template +int metadata_::access(const dir_entry* de, int mode, uid_t uid, + gid_t gid) const { + return dir_reader_->access(de, mode, uid, gid); +} + +template +const directory* metadata_::opendir(const dir_entry* de) const { + if (S_ISDIR(de->mode)) { + return getdir(de); + } + + return nullptr; +} + +template +int metadata_::open(const dir_entry* de) const { + if (S_ISREG(de->mode)) { + return de->inode; + } + + return -1; +} + +template +const dir_entry* +metadata_::readdir(const directory* d, size_t offset, + std::string* name) const { + const dir_entry* de; + + switch (offset) { + case 0: + de = as(d->self); + + if (name) { + name->assign("."); + } + break; + + case 1: + de = as(d->parent); + + if (name) { + name->assign(".."); + } + break; + + default: + offset -= 2; + + if (offset < d->count) { + de = dir_reader_->readdir(d, offset, name); + } else { + return nullptr; + } + + break; + } + + return de; +} + +template +int metadata_::readlink(const dir_entry* de, char* buf, + size_t size) const { + if (S_ISLNK(de->mode)) { + size_t lsize = linksize(de); + + ::memcpy(buf, linkptr(de), std::min(lsize, size)); + + if (size > lsize) { + buf[lsize] = '\0'; + } + + return 0; + } + + return -EINVAL; +} + +template +int metadata_::readlink(const dir_entry* de, + std::string* buf) const { + if (S_ISLNK(de->mode)) { + size_t lsize = linksize(de); + + buf->assign(linkptr(de), lsize); + + return 0; + } + + return -EINVAL; +} + +template 
+int metadata_::statvfs(struct ::statvfs* stbuf) const { + ::memset(stbuf, 0, sizeof(*stbuf)); + + stbuf->f_bsize = 1UL << cfg_->block_size_bits; + stbuf->f_frsize = 1UL; + stbuf->f_blocks = cfg_->orig_fs_size; + stbuf->f_files = cfg_->inode_count; + stbuf->f_flag = ST_RDONLY; + stbuf->f_namemax = PATH_MAX; + + return 0; +} + +template +const chunk_type* +metadata_::get_chunks(int inode, size_t& num) const { + inode -= inode_offset_; + if (inode < static_cast(cfg_->chunk_index_offset) || + inode >= static_cast(cfg_->inode_count)) { + return nullptr; + } + uint32_t off = chunk_index_[inode]; + num = (chunk_index_[inode + 1] - off) / sizeof(chunk_type); + return as(off); +} + +void metadata::get_stat_defaults(struct ::stat* defaults) { + ::memset(defaults, 0, sizeof(struct ::stat)); + defaults->st_uid = ::geteuid(); + defaults->st_gid = ::getegid(); + time_t t = ::time(nullptr); + defaults->st_atime = t; + defaults->st_mtime = t; + defaults->st_ctime = t; +} +#endif + +metadata_v2::metadata_v2(logger& lgr, std::vector&& data, + const struct ::stat* defaults) + : impl_(make_unique_logging_object(lgr, std::move(data), + defaults)) {} +} // namespace dwarfs diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index d9fd7043..5f8c9040 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -58,8 +58,44 @@ #include "dwarfs/script.h" #include "dwarfs/util.h" +#include "dwarfs/gen-cpp2/metadata_layouts.h" +#include "dwarfs/gen-cpp2/metadata_types.h" +#include "dwarfs/gen-cpp2/metadata_types_custom_protocol.h" +#include +#include +#include + namespace dwarfs { +namespace { + +template +std::vector freeze_to_buffer(const T& x) { + using namespace ::apache::thrift::frozen; + + Layout layout; + size_t content_size = LayoutRoot::layout(x, layout); + + std::string schema; + serializeRootLayout(layout, schema); + + size_t schema_size = schema.size(); + auto schema_begin = reinterpret_cast(schema.data()); + std::vector buffer(schema_begin, schema_begin + 
schema_size); + + size_t buffer_size = schema_size + content_size; + buffer.resize(buffer_size, 0); + + folly::MutableByteRange content_range(&buffer[schema_size], content_size); + ByteRangeFreezer::freeze(layout, x, content_range); + + buffer.resize(buffer.size() - content_range.size()); + + return buffer; +} + +} // namespace + template class scanner_ : public scanner::impl { public: @@ -225,16 +261,18 @@ class set_inode_visitor : public entry_visitor { uint32_t inode_no_ = 0; }; -class save_links_visitor : public entry_visitor { +class names_and_links_visitor : public entry_visitor { public: - save_links_visitor(metadata_writer& mw) - : mw_(mw) {} + names_and_links_visitor(metadata_writer& mw, global_entry_data& data) + : mw_(mw) + , data_(data) {} - void visit(file*) override { - // nothing - } + void visit(file* p) override { data_.add_name(p->name()); } void visit(link* p) override { + data_.add_name(p->name()); + data_.add_link(p->linkname()); + const auto& name = p->linkname(); auto r = offset_.emplace(name, mw_.offset()); if (r.second) { @@ -245,19 +283,26 @@ class save_links_visitor : public entry_visitor { p->set_offset(r.first->second); } - void visit(dir*) override { - // nothing + void visit(dir* p) override { + if (p->has_parent()) { + data_.add_name(p->name()); + } } private: metadata_writer& mw_; + global_entry_data& data_; std::unordered_map offset_; }; class save_directories_visitor : public entry_visitor { public: - save_directories_visitor(metadata_writer& mw, std::vector& index) + save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2, + global_entry_data const& ge_data, + std::vector& index) : mw_(mw) + , mv2_(mv2) + , ge_data_(ge_data) , cb_([&](const entry* e, size_t offset) { index.at(e->inode_num()) = folly::to(offset); }) {} @@ -271,17 +316,23 @@ class save_directories_visitor : public entry_visitor { } void visit(dir* p) override { + mv2_.dir_link_index.at(p->inode_num()) = mv2_.directories.size(); + p->pack(mv2_, 
ge_data_); + p->set_offset(mw_.offset()); p->pack(mw_.buffer(p->packed_size()), cb_); if (!p->has_parent()) { cb_(p, mw_.offset()); p->pack_entry(mw_.buffer(p->packed_entry_size())); + p->pack_entry(mv2_, ge_data_); } } private: metadata_writer& mw_; + thrift::metadata::metadata& mv2_; + global_entry_data const& ge_data_; std::function cb_; }; @@ -382,9 +433,8 @@ void scanner_::scan(filesystem_writer& fsw, } // now scan all files - // TODO: automatically adjust # of worker threads based on load root->walk([&](entry* ep) { - wg_.add_job([=, this, &prog] { + wg_.add_job([=, &prog] { if (ep->type() == entry::E_FILE) { prog.current.store(ep); ep->scan(*os_, prog); @@ -480,13 +530,18 @@ void scanner_::scan(filesystem_writer& fsw, log_.info() << "building metadata..."; std::vector metadata_vec; metadata_writer mw(lgr_, metadata_vec); + global_entry_data ge_data; + thrift::metadata::metadata mv2; + mv2.dir_link_index.resize(siv.inode_no()); wg_.add_job([&] { mw.start_section(section_type::META_TABLEDATA); log_.info() << "saving links..."; - save_links_visitor slv(mw); - root->accept(slv); + names_and_links_visitor nlv(mw, ge_data); + root->accept(nlv); + + ge_data.index(); log_.debug() << "link data size = " << mw.section_data_size(); @@ -497,6 +552,11 @@ void scanner_::scan(filesystem_writer& fsw, log_.info() << "updating name offsets..."; root->walk([&](entry* ep) { + ep->update(ge_data); + if (auto lp = dynamic_cast(ep)) { + mv2.dir_link_index.at(ep->inode_num()) = + ge_data.get_link_index(lp->linkname()); + } if (ep->has_parent()) { auto i = name_offset.find(ep->name()); if (i == name_offset.end()) { @@ -536,23 +596,31 @@ void scanner_::scan(filesystem_writer& fsw, log_.debug() << "saved by segmenting: " << size_with_unit(prog.saved_by_segmentation); + // mv2.string_table = std::string( + // reinterpret_cast(mw.section_data()), + // mw.section_data_size()); + // TODO: not sure that's actually needed root->set_name(std::string()); log_.info() << "saving chunks..."; 
std::vector index; index.resize(im->count() + 1); + mv2.chunk_index.resize(im->count() + 1); // TODO: we should be able to start this once all blocks have been // submitted for compression mw.align(im->chunk_size()); im->for_each_inode([&](std::shared_ptr const& ino) { index.at(ino->num() - siv.inode_no()) = folly::to(mw.offset()); + mv2.chunk_index.at(ino->num() - siv.inode_no()) = mv2.chunks.size(); mw.write(ino->chunks()); + ino->append_chunks(mv2.chunks); }); // insert dummy inode to help determine number of chunks per inode index.at(im->count()) = folly::to(mw.offset()); + mv2.chunk_index.at(im->count()) = mv2.chunks.size(); mw.finish_section(); @@ -568,8 +636,9 @@ void scanner_::scan(filesystem_writer& fsw, log_.info() << "saving directories..."; index.resize(siv.inode_no() + im->count()); + mv2.inode_index.resize(siv.inode_no() + im->count()); mw.start_section(section_type::META_DIRECTORIES); - save_directories_visitor sdv(mw, index); + save_directories_visitor sdv(mw, mv2, ge_data, index); root->accept(sdv); mw.finish_section(); @@ -592,8 +661,37 @@ void scanner_::scan(filesystem_writer& fsw, mw.finish_section(); fsw.write_metadata(std::move(metadata_vec)); + + mv2.uids = ge_data.get_uids(); + mv2.gids = ge_data.get_gids(); + mv2.modes = ge_data.get_modes(); + mv2.names = ge_data.get_names(); + mv2.links = ge_data.get_links(); + mv2.timestamp_base = ge_data.timestamp_base; + mv2.chunk_index_offset = siv.inode_no(); + mv2.total_fs_size = prog.original_size; + + fsw.write_metadata_v2(freeze_to_buffer(mv2)); + fsw.flush(); + // ::apache::thrift::frozen::freezeToFile(mv2, folly::File("metadata.frozen", + // O_RDWR | O_CREAT)); + + // auto mapping = folly::MemoryMapping("metadata.frozen"); + + // ::apache::thrift::frozen::Layout layout; + // ::apache::thrift::frozen::schema::Schema schema; + // auto range = mapping.range(); + // apache::thrift::CompactSerializer::deserialize(range, schema); + + // log_.info() << ::apache::thrift::debugString(schema); + + // auto 
mapped = + // ::apache::thrift::frozen::mapFrozen(std::move(mapping)); + + // log_.info() << ::apache::thrift::debugString(mapped.thaw()); + log_.info() << "compressed " << size_with_unit(prog.original_size) << " to " << size_with_unit(prog.compressed_size) << " (ratio=" << static_cast(prog.compressed_size) / prog.original_size diff --git a/src/dwarfs/worker_group.cpp b/src/dwarfs/worker_group.cpp index bc8d707b..382d5b66 100644 --- a/src/dwarfs/worker_group.cpp +++ b/src/dwarfs/worker_group.cpp @@ -57,7 +57,7 @@ class basic_worker_group : public worker_group::impl, private Policy { } for (size_t i = 0; i < num_workers; ++i) { - workers_.emplace_back([=, this] { + workers_.emplace_back([=] { folly::setThreadName(folly::to(group_name, i + 1)); do_work(); }); diff --git a/src/dwarfsck.cpp b/src/dwarfsck.cpp index 8ef162a5..7d79d689 100644 --- a/src/dwarfsck.cpp +++ b/src/dwarfsck.cpp @@ -29,7 +29,7 @@ int main(int argc, char** argv) { if (argc == 2 || argc == 3) { try { - dwarfs::stream_logger lgr(std::cerr, dwarfs::logger::INFO); + dwarfs::stream_logger lgr(std::cerr, dwarfs::logger::DEBUG); dwarfs::filesystem fs(lgr, std::make_shared(argv[1]), dwarfs::block_cache_options()); @@ -48,7 +48,8 @@ int main(int argc, char** argv) { dwarfs::filesystem::identify( lgr, std::make_shared(argv[1]), std::cout); // TODO: - // fs.dump(std::cout); + fs.dump(std::cout); + fs.dump_v2(std::cout); } } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index aadb5bb8..f5d6df5c 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -217,7 +217,7 @@ int mkdwarfs(int argc, char** argv) { block_manager::config cfg; std::string path, output, window_sizes, memory_limit, script_path, - compression, log_level; + compression, metadata_compression, log_level; size_t num_workers, max_scanner_workers; bool no_time = false, no_owner = false, recompress = false, no_progress = false; @@ -255,6 +255,9 @@ int mkdwarfs(int 
argc, char** argv) { ("compression,C", po::value(&compression), "block compression algorithm") + ("metadata-compression", + po::value(&metadata_compression), + "metadata compression algorithm (default: same as block compression)") ("recompress", po::value(&recompress)->zero_tokens(), "recompress an existing filesystem") @@ -363,6 +366,10 @@ int mkdwarfs(int argc, char** argv) { compression = defaults.compression; } + if (!vm.count("metadata-compression")) { + metadata_compression = compression; + } + if (!vm.count("blockhash-window-sizes")) { window_sizes = defaults.window_sizes; } @@ -406,8 +413,9 @@ int mkdwarfs(int argc, char** argv) { progress prog([&](const progress& p, bool last) { lgr.update(p, last); }); block_compressor bc(compression); + block_compressor metadata_bc(metadata_compression); std::ofstream ofs(output); - filesystem_writer fsw(ofs, lgr, wg_writer, prog, bc, mem_limit); + filesystem_writer fsw(ofs, lgr, wg_writer, prog, bc, metadata_bc, mem_limit); if (recompress) { auto ti = log.timed_info(); diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift new file mode 100644 index 00000000..b9357d2e --- /dev/null +++ b/thrift/metadata.thrift @@ -0,0 +1,67 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . 
+ */
+
+namespace cpp2 dwarfs.thrift.metadata
+
+// Thrift has no unsigned integer types; these typedefs map the signed wire
+// types onto the unsigned C++ types named in cpp2.type.
+typedef i16 (cpp2.type = "uint16_t") UInt16
+typedef i32 (cpp2.type = "uint32_t") UInt32
+typedef i64 (cpp2.type = "uint64_t") UInt64
+
+// One piece of file data. The scanner fills the three fields from the block,
+// offset and size of an inode's chunk.
+// NOTE(review): offset/size are presumably byte counts within the block --
+// confirm.
+struct chunk {
+   1: required UInt32 block,
+   2: required UInt32 offset,
+   3: required UInt32 size,
+}
+
+// One directory. Its children are the `entry_count` consecutive elements of
+// metadata.entries starting at `first_entry`.
+struct directory {
+   1: required UInt32 self_inode,
+   2: required UInt32 parent_inode,
+   3: required UInt32 first_entry,
+   4: required UInt32 entry_count,
+}
+
+// One file system entry. `name_index` and `mode` are not the values
+// themselves but indices into metadata.names and metadata.modes.
+// NOTE(review): owner/group presumably index metadata.uids/gids the same way,
+// and the timestamps are presumably relative to metadata.timestamp_base --
+// confirm.
+struct entry {
+   1: required UInt32 name_index,
+   2: required UInt16 mode,
+   3: required UInt32 inode,
+   4: required UInt16 owner,
+   5: required UInt16 group,
+   6: required UInt64 atime,
+   7: required UInt64 mtime,
+   8: required UInt64 ctime,
+}
+
+// Root object of the frozen metadata.
+//
+//  - chunk_index: indexed by (inode - chunk_index_offset). Slot i is the
+//    position of that inode's first chunk in `chunks`, slot i + 1 is one
+//    past its last chunk; the scanner appends one extra trailing slot so
+//    that this also works for the last inode.
+//  - inode_index: the reader takes entries[inode_index[0]] as the root entry.
+//  - dir_link_index: indexed by inode. For a directory it is the position in
+//    `directories`, for a symlink the position in `links`.
+//  - uids, gids, modes, names, links: lookup tables filled by the scanner
+//    from its collected global entry data.
+//  - chunk_index_offset: set by the scanner to the inode number at which
+//    chunk_index starts.
+//  - total_fs_size: set by the scanner to the original (uncompressed) size.
+struct metadata {
+   1: required list chunks,
+   2: required list chunk_index,
+   3: required list directories,
+   4: required list entries,
+   5: required list inode_index,
+   6: required list dir_link_index,
+   7: required list uids,
+   8: required list gids,
+   9: required list modes,
+  10: required list names,
+  11: required list links,
+  12: required UInt64 timestamp_base,
+  13: required UInt32 chunk_index_offset;
+  14: required UInt64 total_fs_size;
+}