From e3a4f8db09c2c788b85a9e5961ad2e9fb6575357 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sat, 28 Nov 2020 00:54:11 +0100 Subject: [PATCH] metadata_v2: v1 removal, part 1 --- src/dwarfs.cpp | 95 +++++----------- src/dwarfs/scanner.cpp | 247 +++-------------------------------------- src/dwarfsck.cpp | 15 +-- src/mkdwarfs.cpp | 4 +- test/dwarfs.cpp | 46 +++----- 5 files changed, 65 insertions(+), 342 deletions(-) diff --git a/src/dwarfs.cpp b/src/dwarfs.cpp index dae60310..ebdc806d 100644 --- a/src/dwarfs.cpp +++ b/src/dwarfs.cpp @@ -32,14 +32,7 @@ #include -#define USE_META_V2 - -#ifdef USE_META_V2 #include "dwarfs/filesystem_v2.h" -#else -#include "dwarfs/filesystem.h" -#endif - #include "dwarfs/metadata_v2.h" #include "dwarfs/mmap.h" #include "dwarfs/options.h" @@ -78,16 +71,9 @@ const struct fuse_opt dwarfs_opts[] = { DWARFS_OPT("workers=%s", workers_str, 0), DWARFS_OPT("decratio=%s", decompress_ratio_str, 0), FUSE_OPT_END}; -#ifdef USE_META_V2 -using filesystem = filesystem_v2; -#define ENTRY_V2(e) (*(e)) -#else -#define ENTRY_V2(e) (e) -#endif - options opts; stream_logger s_lgr(std::cerr); -std::shared_ptr s_fs; +std::shared_ptr s_fs; void op_init(void* /*userdata*/, struct fuse_conn_info* /*conn*/) { DEBUG_FUNC("") @@ -96,7 +82,7 @@ void op_init(void* /*userdata*/, struct fuse_conn_info* /*conn*/) { bco.num_workers = opts.workers; bco.decompress_ratio = opts.decompress_ratio; s_fs = - std::make_shared(s_lgr, std::make_shared(opts.fsimage), + std::make_shared(s_lgr, std::make_shared(opts.fsimage), bco, &opts.stat_defaults, FUSE_ROOT_ID); } @@ -111,12 +97,12 @@ void op_lookup(fuse_req_t req, fuse_ino_t parent, const char* name) { int err = ENOENT; try { - auto de = s_fs->find(parent, name); + auto entry = s_fs->find(parent, name); - if (de) { + if (entry) { struct ::fuse_entry_param e; - err = s_fs->getattr(ENTRY_V2(de), &e.attr); + err = s_fs->getattr(*entry, &e.attr); if (err == 0) { e.generation = 1; @@ -147,12 +133,12 @@ void op_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info*) { // TODO: merge with op_lookup try { - auto de = s_fs->find(ino); + auto entry = s_fs->find(ino); - if (de) { + if (entry) { struct ::stat stbuf; - err = s_fs->getattr(ENTRY_V2(de), &stbuf); + err = s_fs->getattr(*entry, &stbuf); if (err == 0) { fuse_reply_attr(req, &stbuf, std::numeric_limits::max()); @@ -178,11 +164,11 @@ void op_access(fuse_req_t req, fuse_ino_t ino, int mode) { // TODO: merge with op_lookup try { - auto de = s_fs->find(ino); + auto entry = s_fs->find(ino); - if (de) { + if (entry) { auto ctx = fuse_req_ctx(req); - err = s_fs->access(ENTRY_V2(de), mode, ctx->uid, ctx->gid); + err = s_fs->access(*entry, mode, ctx->uid, ctx->gid); } } catch (const dwarfs::error& e) { std::cerr << "ERROR: " << e.what() << std::endl; @@ -201,12 +187,12 @@ void op_readlink(fuse_req_t req, fuse_ino_t ino) { int err = ENOENT; try { - auto de = s_fs->find(ino); + auto entry = s_fs->find(ino); - if (de) { + if (entry) { std::string str; - err = s_fs->readlink(ENTRY_V2(de), &str); + err = s_fs->readlink(*entry, &str); if (err == 0) { fuse_reply_readlink(req, str.c_str()); @@ -231,23 +217,15 @@ void op_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info* fi) { int err = ENOENT; try { - auto de = s_fs->find(ino); + auto entry = s_fs->find(ino); - if (de) { -#ifdef USE_META_V2 - if (S_ISDIR(de->mode())) { -#else - if (S_ISDIR(de->mode)) { -#endif + if (entry) { + if (S_ISDIR(entry->mode())) { err = EISDIR; } else if (fi->flags & (O_APPEND | O_CREAT | O_TRUNC)) { err = EACCES; } else { -#ifdef USE_META_V2 - fi->fh = FUSE_ROOT_ID + de->inode(); -#else - fi->fh = reinterpret_cast(de); -#endif + fi->fh = FUSE_ROOT_ID + entry->inode(); fi->keep_cache = 1; fuse_reply_open(req, fi); return; @@ -271,13 +249,7 @@ void op_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, int err = ENOENT; try { -#ifdef USE_META_V2 assert(fi->fh == ino); -#else - auto de = reinterpret_cast(fi->fh); - - if (de) { -#endif iovec_read_buf buf; ssize_t rv = s_fs->readv(ino, buf, size, off); @@ -291,9 +263,6 @@ void op_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, } err = -rv; -#ifndef USE_META_V2 - } -#endif } catch (const dwarfs::error& e) { std::cerr << "ERROR: " << e.what() << std::endl; @@ -314,33 +283,25 @@ void op_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, int err = ENOENT; try { - auto de = s_fs->find(ino); + auto dirent = s_fs->find(ino); - if (de) { - auto d = s_fs->opendir(ENTRY_V2(de)); + if (dirent) { + auto dir = s_fs->opendir(*dirent); - if (d) { - off_t lastoff = s_fs->dirsize(ENTRY_V2(d)); -#ifndef USE_META_V2 - std::string name; -#endif + if (dir) { + off_t lastoff = s_fs->dirsize(*dir); struct stat stbuf; std::vector buf(size); size_t written = 0; while (off < lastoff) { -#ifdef USE_META_V2 - auto res = s_fs->readdir(*d, off); + auto res = s_fs->readdir(*dir, off); assert(res); - auto [de2, name_view] = *res; - std::string name(name_view); -#else - auto de2 = s_fs->readdir(d, off, &name); -#endif - s_fs->getattr(de2, &stbuf); - /// std::cerr << ">>> " << off << "/" << lastoff << " - " << name << " - /// - " << stbuf.st_ino << std::endl; + auto [entry, name_view] = *res; + std::string name(name_view); + + s_fs->getattr(entry, &stbuf); size_t needed = fuse_add_direntry(req, &buf[written], buf.size() - written, diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 6646120c..84698cd1 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -19,37 +19,22 @@ * along with dwarfs. If not, see . */ -#include #include #include #include -#include -#include -#include -#include #include -#include #include #include #include -#include -#include -#include - -#include -#include - #include "dwarfs/config.h" -#include "dwarfs/cyclic_hash.h" #include "dwarfs/entry.h" #include "dwarfs/filesystem_writer.h" #include "dwarfs/fstypes.h" #include "dwarfs/hash_util.h" #include "dwarfs/inode_manager.h" #include "dwarfs/logger.h" -#include "dwarfs/metadata.h" #include "dwarfs/metadata_v2.h" #include "dwarfs/metadata_writer.h" #include "dwarfs/options.h" @@ -59,12 +44,6 @@ #include "dwarfs/script.h" #include "dwarfs/util.h" -#include "dwarfs/gen-cpp2/metadata_layouts.h" -#include "dwarfs/gen-cpp2/metadata_types.h" -#include "dwarfs/gen-cpp2/metadata_types_custom_protocol.h" -#include -#include - namespace dwarfs { template @@ -77,28 +56,6 @@ class scanner_ : public scanner::impl { void scan(filesystem_writer& fsw, const std::string& path, progress& prog); private: - template > - class fast_hash_map : public google::dense_hash_map { - public: - fast_hash_map() { this->set_empty_key(Key()); } - }; - - template > - class fast_hash_set : public google::dense_hash_set { - public: - fast_hash_set() { this->set_empty_key(T()); } - }; - - // We want these to be ordered - // TODO: StringPiece? - // TODO: Use dense/unordered maps/sets and sort later? - using file_name_table_t = - fast_hash_map>; - - std::unordered_map - compress_names_table(metadata_writer& mw, - const file_name_table_t& file_name) const; - const block_manager::config& cfg_; const scanner_options& options_; std::shared_ptr entry_; @@ -125,94 +82,6 @@ scanner_::scanner_(logger& lgr, worker_group& wg, , lgr_(lgr) , log_(lgr) {} -template -std::unordered_map -scanner_::compress_names_table( - metadata_writer& mw, const file_name_table_t& file_name) const { - log_.info() << "compressing names table..."; - auto ti = log_.timed_info(); - - google::dense_hash_map index; - using position_vector = folly::small_vector; - std::vector positions; - index.set_empty_key(0); - uint32_t index_pos = 0; - - std::unordered_map offset; - size_t saved = 0; - size_t orig_offset = mw.offset(); - - std::vector sizes(file_name.size()); - std::transform(file_name.begin(), file_name.end(), sizes.begin(), - [](const auto& p) { return p.first; }); - std::sort(sizes.begin(), sizes.end(), std::greater()); - - for (auto size : sizes) { - auto nsi = file_name.find(size); - assert(nsi != file_name.end()); - std::vector names(nsi->second.size()); - std::copy(nsi->second.begin(), nsi->second.end(), names.begin()); - std::sort(names.begin(), names.end()); - - for (auto k : names) { - bool found = false; - - if (!index.empty() && k.size() >= sizeof(uint32_t)) { - uint32_t key; - std::memcpy(&key, k.data(), sizeof(key)); - auto it = index.find(key); - if (it != index.end()) { - for (uint32_t pos : positions[it->second]) { - if (std::memcmp(mw.section_data() + pos + sizeof(key), - k.data() + sizeof(key), - k.size() - sizeof(key)) == 0) { - offset[k] = mw.section_data_offset() + pos; - saved += k.size(); - found = true; - break; - } - } - } - } else { - auto it = std::search(mw.section_begin(), mw.end(), k.begin(), k.end()); - - if (it != mw.end()) { - offset[k] = it - mw.begin(); - saved += k.size(); - found = true; - } - } - - if (!found) { - offset[k] = mw.offset(); - mw.write(k); - - if (mw.section_data_size() >= sizeof(uint32_t)) { - uint32_t last = mw.section_data_size() - sizeof(uint32_t); - while (index_pos <= last) { - uint32_t key; - std::memcpy(&key, mw.section_data() + index_pos, sizeof(key)); - auto r = index.insert(std::make_pair(key, positions.size())); - uint32_t pos_index; - if (r.second) { - pos_index = positions.size(); - positions.resize(pos_index + 1); - } else { - pos_index = r.first->second; - } - positions[pos_index].push_back(index_pos++); - } - } - } - } - } - - ti << "names table: " << size_with_unit(mw.offset() - orig_offset) << " (" - << size_with_unit(saved) << " saved)"; - - return offset; -} - class dir_set_inode_visitor : public entry_visitor { public: dir_set_inode_visitor(uint32_t& inode_no) @@ -250,24 +119,14 @@ class link_set_inode_visitor : public entry_visitor { class names_and_links_visitor : public entry_visitor { public: - names_and_links_visitor(metadata_writer& mw, global_entry_data& data) - : mw_(mw) - , data_(data) {} + names_and_links_visitor(global_entry_data& data) + : data_(data) {} void visit(file* p) override { data_.add_name(p->name()); } void visit(link* p) override { data_.add_name(p->name()); data_.add_link(p->linkname()); - - const auto& name = p->linkname(); - auto r = offset_.emplace(name, mw_.offset()); - if (r.second) { - uint16_t len = folly::to(name.size()); - mw_.write(len); - mw_.write(name); - } - p->set_offset(r.first->second); } void visit(dir* p) override { @@ -277,24 +136,17 @@ class names_and_links_visitor : public entry_visitor { } private: - metadata_writer& mw_; global_entry_data& data_; - std::unordered_map offset_; }; class save_directories_visitor : public entry_visitor { public: - save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2, + save_directories_visitor(thrift::metadata::metadata& mv2, global_entry_data const& ge_data, - std::vector& dir_index, - std::vector& index) - : mw_(mw) - , mv2_(mv2) + std::vector& dir_index) + : mv2_(mv2) , ge_data_(ge_data) - , dir_index_(dir_index) - , cb_([&](const entry* e, size_t offset) { - index.at(e->inode_num()) = folly::to(offset); - }) {} + , dir_index_(dir_index) {} void visit(file*) override { // nothing @@ -308,22 +160,15 @@ class save_directories_visitor : public entry_visitor { dir_index_.at(p->inode_num()) = mv2_.directories.size(); p->pack(mv2_, ge_data_); - p->set_offset(mw_.offset()); - p->pack(mw_.buffer(p->packed_size()), cb_); - if (!p->has_parent()) { - cb_(p, mw_.offset()); - p->pack_entry(mw_.buffer(p->packed_entry_size())); p->pack_entry(mv2_, ge_data_); } } private: - metadata_writer& mw_; thrift::metadata::metadata& mv2_; global_entry_data const& ge_data_; std::vector& dir_index_; - std::function cb_; }; template @@ -436,20 +281,14 @@ void scanner_::scan(filesystem_writer& fsw, log_.info() << "waiting for background scanners..."; wg_.wait(); - size_t total{0}; std::unordered_map, folly::Hash> file_hash; - file_name_table_t file_name; + // TODO: turn into visitor? root->walk([&](entry* ep) { if (auto fp = dynamic_cast(ep)) { file_hash[fp->hash()].push_back(fp); } - if (ep->has_parent()) { - const std::string& name = ep->name(); - file_name[name.size()].insert(name); - total += name.size(); - } }); log_.info() << "finding duplicate files..."; @@ -522,31 +361,22 @@ void scanner_::scan(filesystem_writer& fsw, im->number_inodes(first_file_inode); log_.info() << "building metadata..."; - std::vector metadata_vec; - metadata_writer mw(lgr_, metadata_vec); + global_entry_data ge_data( options_.no_time); // TODO: just pass options directly + thrift::metadata::metadata mv2; std::vector dir_index; dir_index.resize(first_link_inode); mv2.link_index.resize(first_file_inode - first_link_inode); wg_.add_job([&] { - mw.start_section(section_type::META_TABLEDATA); - log_.info() << "saving links..."; - names_and_links_visitor nlv(mw, ge_data); + names_and_links_visitor nlv(ge_data); root->accept(nlv); ge_data.index(); - log_.debug() << "link data size = " << mw.section_data_size(); - - log_.info() << "saving names..."; - auto name_offset = compress_names_table(mw, file_name); - - log_.debug() << "name data size = " << mw.section_data_size(); - log_.info() << "updating name offsets..."; root->walk([&](entry* ep) { ep->update(ge_data); @@ -554,13 +384,6 @@ void scanner_::scan(filesystem_writer& fsw, mv2.link_index.at(ep->inode_num() - first_link_inode) = ge_data.get_link_index(lp->linkname()); } - if (ep->has_parent()) { - auto i = name_offset.find(ep->name()); - if (i == name_offset.end()) { - throw std::runtime_error("offset not found for entry name"); - } - ep->set_name_offset(i->second); - } }); }); @@ -593,74 +416,32 @@ void scanner_::scan(filesystem_writer& fsw, log_.debug() << "saved by segmenting: " << size_with_unit(prog.saved_by_segmentation); - // mv2.string_table = std::string( - // reinterpret_cast(mw.section_data()), - // mw.section_data_size()); - - // TODO: not sure that's actually needed + // this is actually needed root->set_name(std::string()); log_.info() << "saving chunks..."; - std::vector index; - index.resize(im->count() + 1); mv2.chunk_index.resize(im->count() + 1); // TODO: we should be able to start this once all blocks have been // submitted for compression - mw.align(im->chunk_size()); im->for_each_inode([&](std::shared_ptr const& ino) { - index.at(ino->num() - first_file_inode) = folly::to(mw.offset()); mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size(); - mw.write(ino->chunks()); ino->append_chunks(mv2.chunks); }); // insert dummy inode to help determine number of chunks per inode - index.at(im->count()) = folly::to(mw.offset()); mv2.chunk_index.at(im->count()) = mv2.chunks.size(); - mw.finish_section(); - - size_t num_chunks = (index.back() - index.front()) / sizeof(chunk_type); - log_.debug() << "total number of file inodes: " << im->count(); - log_.debug() << "total number of chunks: " << num_chunks; - - log_.info() << "saving chunk index..."; - mw.start_section(section_type::META_CHUNK_INDEX); - mw.write(index); - mw.finish_section(); + log_.debug() << "total number of chunks: " << mv2.chunks.size(); log_.info() << "saving directories..."; - index.resize(first_file_inode + im->count()); mv2.entry_index.resize(first_file_inode + im->count()); - mw.start_section(section_type::META_DIRECTORIES); - save_directories_visitor sdv(mw, mv2, ge_data, dir_index, index); + save_directories_visitor sdv(mv2, ge_data, dir_index); root->accept(sdv); - mw.finish_section(); - - log_.info() << "saving inode index..."; - mw.start_section(section_type::META_INODE_INDEX); - mw.write(index); - mw.finish_section(); - - log_.info() << "saving metadata config..."; - mw.start_section(section_type::META_CONFIG); - meta_config mconf; - mconf.block_size_bits = folly::to(im->block_size_bits()); - mconf.de_type = entry_->de_type(); - mconf.unused = 0; - mconf.inode_count = first_file_inode + im->count(); - mconf.orig_fs_size = prog.original_size; - mconf.chunk_index_offset = first_file_inode; - mconf.inode_index_offset = 0; - mw.write(mconf); - mw.finish_section(); - - // TODO: remove all metadata v1 code - // fsw.write_metadata(std::move(metadata_vec)); { + // order directories by inode number std::vector tmp = std::move(mv2.directories); mv2.directories.reserve(tmp.size()); for (auto i : dir_index) { diff --git a/src/dwarfsck.cpp b/src/dwarfsck.cpp index 9a3e7efd..59295de7 100644 --- a/src/dwarfsck.cpp +++ b/src/dwarfsck.cpp @@ -22,7 +22,6 @@ #include #include -#include "dwarfs/filesystem.h" #include "dwarfs/filesystem_v2.h" #include "dwarfs/mmap.h" #include "dwarfs/options.h" @@ -32,26 +31,22 @@ int main(int argc, char** argv) { try { dwarfs::stream_logger lgr(std::cerr, dwarfs::logger::DEBUG); auto mm = std::make_shared(argv[1]); - dwarfs::filesystem fs(lgr, mm, dwarfs::block_cache_options()); - dwarfs::filesystem_v2 fs_v2(lgr, mm, dwarfs::block_cache_options()); + dwarfs::filesystem_v2 fs(lgr, mm, dwarfs::block_cache_options()); if (argc == 3) { - auto de = fs.find(argv[2]); + auto entry = fs.find(argv[2]); - if (de) { + if (entry) { struct ::stat stbuf; - fs.getattr(de, &stbuf); + fs.getattr(*entry, &stbuf); std::vector data(stbuf.st_size); fs.read(stbuf.st_ino, &data[0], data.size(), 0); std::cout.write(&data[0], data.size()); } } else { // TODO: add more usage options... - dwarfs::filesystem::identify(lgr, mm, std::cout); - fs.dump(std::cout); - dwarfs::filesystem_v2::identify(lgr, mm, std::cout); - fs_v2.dump(std::cout); + fs.dump(std::cout); } } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index 7aaa203c..4ee24b69 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -47,7 +47,7 @@ #include "dwarfs/block_manager.h" #include "dwarfs/console_writer.h" #include "dwarfs/entry.h" -#include "dwarfs/filesystem.h" +#include "dwarfs/filesystem_v2.h" #include "dwarfs/filesystem_writer.h" #include "dwarfs/logger.h" #include "dwarfs/lua_script.h" @@ -458,7 +458,7 @@ int mkdwarfs(int argc, char** argv) { if (recompress) { auto ti = log.timed_info(); - filesystem::rewrite(lgr, prog, std::make_shared(path), fsw); + filesystem_v2::rewrite(lgr, prog, std::make_shared(path), fsw); wg_writer.wait(); ti << "filesystem rewritten"; } else { diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index e3984eea..c48125fb 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ -26,7 +26,6 @@ #include "dwarfs/block_compressor.h" #include "dwarfs/entry.h" -#include "dwarfs/filesystem.h" #include "dwarfs/filesystem_v2.h" #include "dwarfs/filesystem_writer.h" #include "dwarfs/logger.h" @@ -163,31 +162,6 @@ using namespace dwarfs; namespace { -dir_entry const* get_entry(dir_entry const* de) { return de; } - -entry_view get_entry(std::optional entry) { return *entry; } - -template -void test_created_filesystem(T const& fs) { - auto de = fs.find("/foo.pl"); - struct ::stat st; - - ASSERT_TRUE(de); - - auto entry = get_entry(de); - - EXPECT_EQ(fs.getattr(entry, &st), 0); - EXPECT_EQ(st.st_size, 23456); - - int inode = fs.open(entry); - EXPECT_GE(inode, 0); - - std::vector buf(st.st_size); - ssize_t rv = fs.read(inode, &buf[0], st.st_size, 0); - EXPECT_EQ(rv, st.st_size); - EXPECT_EQ(std::string(buf.begin(), buf.end()), test::loremipsum(st.st_size)); -} - void basic_end_to_end_test(const std::string& compressor, unsigned block_size_bits, file_order_mode file_order, bool no_owner, bool no_time) { @@ -225,11 +199,23 @@ void basic_end_to_end_test(const std::string& compressor, block_cache_options bco; bco.max_bytes = 1 << 20; - filesystem fs(lgr, mm, bco); - test_created_filesystem(fs); + filesystem_v2 fs(lgr, mm, bco); - filesystem_v2 fs_v2(lgr, mm, bco); - test_created_filesystem(fs_v2); + auto entry = fs.find("/foo.pl"); + struct ::stat st; + + ASSERT_TRUE(entry); + + EXPECT_EQ(fs.getattr(*entry, &st), 0); + EXPECT_EQ(st.st_size, 23456); + + int inode = fs.open(*entry); + EXPECT_GE(inode, 0); + + std::vector buf(st.st_size); + ssize_t rv = fs.read(inode, &buf[0], st.st_size, 0); + EXPECT_EQ(rv, st.st_size); + EXPECT_EQ(std::string(buf.begin(), buf.end()), test::loremipsum(st.st_size)); } std::vector const compressions{"null",