diff --git a/CMakeLists.txt b/CMakeLists.txt index e469692b..565a2de5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,6 +225,7 @@ list( add_library( thrift_light + ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/FieldRef.cpp ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/CompactProtocol.cpp ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/BinaryProtocol.cpp ${CMAKE_CURRENT_SOURCE_DIR}/fbthrift/thrift/lib/cpp2/protocol/DebugProtocol.cpp diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h index 5ee91e41..7eec1e25 100644 --- a/include/dwarfs/entry.h +++ b/include/dwarfs/entry.h @@ -46,6 +46,7 @@ class metadata; class file; class link; class dir; +class device; class inode; class os_access; class progress; @@ -55,13 +56,14 @@ class entry_visitor { public: virtual ~entry_visitor() = default; virtual void visit(file* p) = 0; + virtual void visit(device* p) = 0; /* visits a `device` entry, i.e. anything that is not a file, dir or link */ virtual void visit(link* p) = 0; virtual void visit(dir* p) = 0; }; class entry : public file_interface { public: - enum type_t { E_FILE, E_DIR, E_LINK }; + enum type_t { E_FILE, E_DIR, E_LINK, E_DEVICE, E_OTHER }; /* E_DEVICE: char or block device; E_OTHER: every other entry represented by class `device` */ entry(const std::string& name, std::shared_ptr parent, const struct ::stat& st); @@ -156,6 +158,25 @@ class link : public entry { uint32_t inode_{0}; }; +/** + * A `device` actually represents anything that's not a file, + * dir or link. 
+ */ +class device : public entry { + public: + using entry::entry; + + type_t type() const override; /* E_DEVICE if st_mode is a char or block device, E_OTHER otherwise */ + void set_inode(uint32_t inode); + void accept(entry_visitor& v, bool preorder) override; /* calls v.visit(this); the preorder flag is ignored */ + uint32_t inode_num() const override { return inode_; } + void scan(os_access& os, progress& prog) override; /* empty implementation */ + uint64_t device_id() const; /* returns st_rdev from the stat data of this entry */ + + private: + uint32_t inode_{0}; +}; + class entry_factory { public: static std::unique_ptr create(bool with_similarity = false); diff --git a/include/dwarfs/progress.h b/include/dwarfs/progress.h index 42faf614..29cffce6 100644 --- a/include/dwarfs/progress.h +++ b/include/dwarfs/progress.h @@ -58,6 +58,7 @@ class progress { std::atomic dirs_scanned{0}; std::atomic links_found{0}; std::atomic links_scanned{0}; + std::atomic specials_found{0}; /* incremented by the scanner for each E_DEVICE or E_OTHER entry */ std::atomic duplicate_files{0}; std::atomic block_count{0}; std::atomic chunk_count{0}; diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp index 9e7c591c..d6de18f5 100644 --- a/src/dwarfs/entry.cpp +++ b/src/dwarfs/entry.cpp @@ -73,6 +73,10 @@ std::string entry::type_string() const { return "link"; case E_DIR: return "dir"; + case E_DEVICE: + return "device"; + case E_OTHER: + return "pipe/socket"; default: throw std::runtime_error("invalid file type"); } @@ -220,6 +224,19 @@ void link::scan(os_access& os, progress& prog) { prog.original_size += size(); } +entry::type_t device::type() const { + auto mode = status().st_mode; + return S_ISCHR(mode) || S_ISBLK(mode) ? 
E_DEVICE : E_OTHER; +} + +void device::set_inode(uint32_t inode) { inode_ = inode; } + +void device::accept(entry_visitor& v, bool) { v.visit(this); } + +void device::scan(os_access&, progress&) {} /* no-op: neither os nor prog is touched */ + +uint64_t device::device_id() const { return status().st_rdev; } + class entry_factory_ : public entry_factory { public: entry_factory_(bool with_similarity) @@ -231,14 +248,18 @@ class entry_factory_ : public entry_factory { struct ::stat st; os.lstat(p, &st); + auto mode = st.st_mode; - if (S_ISREG(st.st_mode)) { + if (S_ISREG(mode)) { return std::make_shared(name, std::move(parent), st, with_similarity_); - } else if (S_ISDIR(st.st_mode)) { + } else if (S_ISDIR(mode)) { return std::make_shared(name, std::move(parent), st); - } else if (S_ISLNK(st.st_mode)) { + } else if (S_ISLNK(mode)) { return std::make_shared(name, std::move(parent), st); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || + S_ISSOCK(mode)) { + return std::make_shared(name, std::move(parent), st); /* char/block devices, FIFOs and sockets share one entry class; presumably `device`, the template argument is not visible here */ } else { // TODO: warn } diff --git a/src/dwarfs/metadata_v2.cpp b/src/dwarfs/metadata_v2.cpp index 037f1ff4..bb362010 100644 --- a/src/dwarfs/metadata_v2.cpp +++ b/src/dwarfs/metadata_v2.cpp @@ -34,6 +34,8 @@ #include #include +#include + #include "dwarfs/logger.h" #include "dwarfs/metadata_v2.h" @@ -103,11 +105,12 @@ class metadata_ : public metadata_v2::impl { , root_(meta_.entries()[meta_.entry_index()[0]], &meta_) , log_(lgr) , inode_offset_(inode_offset) - , chunk_index_offset_( - find_index_offset(meta_.entry_index().size(), - [](uint16_t mode) { return S_ISREG(mode); })) - , link_index_offset_(find_index_offset( - chunk_index_offset_, [](uint16_t mode) { return S_ISLNK(mode); })) { + , link_index_offset_(find_index_offset(inode_rank::INO_LNK)) + , chunk_index_offset_(find_index_offset(inode_rank::INO_REG)) + , dev_index_offset_(find_index_offset(inode_rank::INO_DEV)) { /* each offset is the first inode whose rank is not less than the given rank */ + log_.debug() << "link index offset: " << link_index_offset_; + log_.debug() << "chunk index offset: " << 
chunk_index_offset_; + log_.debug() << "device index offset: " << dev_index_offset_; } void dump(std::ostream& os, int detail_level, @@ -159,15 +162,63 @@ class metadata_ : public metadata_v2::impl { return make_entry_view(meta_.entry_index()[inode]); } - template - size_t find_index_offset(size_t last, Func&& func) const { - auto range = boost::irange(size_t(0), last); + // This represents the order in which inodes are stored in entry_index + enum class inode_rank { + INO_DIR, + INO_LNK, + INO_REG, + INO_DEV, + INO_OTH, + }; - auto it = - std::upper_bound(range.begin(), range.end(), 0, [&](int, auto inode) { - auto e = make_entry_view_from_inode(inode); - return bool(func(e.mode())); - }); + static inode_rank get_inode_rank(uint16_t mode) { /* maps the S_IFMT file type bits of mode to the rank used for ordering inodes; throws std::runtime_error for any other file type */ + switch ((mode)&S_IFMT) { + case S_IFDIR: + return inode_rank::INO_DIR; + case S_IFLNK: + return inode_rank::INO_LNK; + case S_IFREG: + return inode_rank::INO_REG; + case S_IFBLK: + case S_IFCHR: + return inode_rank::INO_DEV; + case S_IFSOCK: + case S_IFIFO: + return inode_rank::INO_OTH; + default: + throw std::runtime_error(fmt::format("unknown file type: {:#06x}", mode)); + } + } + + static char get_filetype_label(uint16_t mode) { /* one-character label for the file type bits of mode (d, l, -, b, c, s, p); throws std::runtime_error for any other file type */ + switch ((mode)&S_IFMT) { + case S_IFDIR: + return 'd'; + case S_IFLNK: + return 'l'; + case S_IFREG: + return '-'; + case S_IFBLK: + return 'b'; + case S_IFCHR: + return 'c'; + case S_IFSOCK: + return 's'; + case S_IFIFO: + return 'p'; + default: + throw std::runtime_error(fmt::format("unknown file type: {:#06x}", mode)); + } + } + + size_t find_index_offset(inode_rank rank) const { /* returns the first inode whose rank is not less than rank, found with std::lower_bound over [0, entry_index().size()); NOTE(review): when no such inode exists the end iterator is dereferenced, which presumably yields entry_index().size() with boost::irange - confirm */ + auto range = boost::irange(size_t(0), meta_.entry_index().size()); + + auto it = std::lower_bound(range.begin(), range.end(), rank, + [&](auto inode, inode_rank r) { + auto e = make_entry_view_from_inode(inode); + return get_inode_rank(e.mode()) < r; + }); return *it; } @@ -226,13 +277,22 @@ class metadata_ : public metadata_v2::impl { .links()[meta_.link_index()[entry.inode() - link_index_offset_]]; } + uint64_t 
get_device_id(int inode) const { /* returns the device id stored at index inode - dev_index_offset_, or logs an error and returns 0 when the metadata has no devices list; NOTE(review): the index is not range-checked */ + if (auto devs = meta_.devices()) { + return (*devs)[inode - dev_index_offset_]; + } + log_.error() << "get_device_id() called, but no devices in file system"; + return 0; + } + folly::ByteRange data_; MappedFrozen meta_; entry_view root_; log_proxy log_; const int inode_offset_; - const int chunk_index_offset_; const int link_index_offset_; + const int chunk_index_offset_; + const int dev_index_offset_; /* the three offsets are declared in the same order as the constructor initializes them */ }; template @@ -262,8 +322,14 @@ void metadata_::dump( std::move(icb)); } else if (S_ISLNK(mode)) { os << " -> " << link_value(entry) << "\n"; - } else { - os << " (unknown type)\n"; + } else if (S_ISBLK(mode)) { + os << " (block device: " << get_device_id(inode) << ")\n"; + } else if (S_ISCHR(mode)) { + os << " (char device: " << get_device_id(inode) << ")\n"; + } else if (S_ISFIFO(mode)) { + os << " (named pipe)\n"; + } else if (S_ISSOCK(mode)) { + os << " (socket)\n"; } } @@ -315,7 +381,7 @@ std::string metadata_::modestring(uint16_t mode) const { oss << (mode & S_ISUID ? 'U' : '-'); oss << (mode & S_ISGID ? 'G' : '-'); oss << (mode & S_ISVTX ? 'S' : '-'); - oss << (S_ISDIR(mode) ? 'd' : S_ISLNK(mode) ? 'l' : '-'); + oss << get_filetype_label(mode); /* unlike the replaced expression, this throws for an unknown file type */ oss << (mode & S_IRUSR ? 'r' : '-'); oss << (mode & S_IWUSR ? 'w' : '-'); oss << (mode & S_IXUSR ? 
'x' : '-'); @@ -419,10 +485,11 @@ int metadata_::getattr(entry_view entry, auto mode = entry.mode(); auto timebase = meta_.timestamp_base(); + auto inode = entry.inode(); stbuf->st_mode = mode; stbuf->st_size = file_size(entry, mode); - stbuf->st_ino = entry.inode() + inode_offset_; + stbuf->st_ino = inode + inode_offset_; stbuf->st_blocks = (stbuf->st_size + 511) / 512; stbuf->st_uid = entry.getuid(); stbuf->st_gid = entry.getgid(); @@ -430,6 +497,10 @@ int metadata_::getattr(entry_view entry, stbuf->st_mtime = timebase + entry.mtime_offset(); stbuf->st_ctime = timebase + entry.ctime_offset(); + if (S_ISBLK(mode) || S_ISCHR(mode)) { + stbuf->st_rdev = get_device_id(inode); /* st_rdev is assigned here only for block and char devices */ + } + return 0; } diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 5492f94d..6414f7f1 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -64,6 +64,7 @@ class visitor_base : public entry_visitor { void visit(file*) override {} void visit(link*) override {} void visit(dir*) override {} + void visit(device*) override {} }; class scan_files_visitor : public visitor_base { @@ -150,6 +151,40 @@ class link_set_inode_visitor : public visitor_base { uint32_t& inode_no_; }; +class device_set_inode_visitor : public visitor_base { /* numbers every E_DEVICE entry from inode_no upwards and collects the device ids in visiting order; inode_no is held by reference, so the counter of the caller is advanced */ + public: + device_set_inode_visitor(uint32_t& inode_no) + : inode_no_(inode_no) {} + + void visit(device* p) override { + if (p->type() == entry::E_DEVICE) { + p->set_inode(inode_no_++); + dev_ids_.push_back(p->device_id()); + } + } + + std::vector& device_ids() { return dev_ids_; } + + private: + std::vector dev_ids_; + uint32_t& inode_no_; +}; + +class pipe_set_inode_visitor : public visitor_base { /* numbers every `device` entry whose type is not E_DEVICE from inode_no upwards; the counter of the caller is advanced */ + public: + pipe_set_inode_visitor(uint32_t& inode_no) + : inode_no_(inode_no) {} + + void visit(device* p) override { + if (p->type() != entry::E_DEVICE) { + p->set_inode(inode_no_++); + } + } + + private: + uint32_t& inode_no_; +}; + class names_and_links_visitor : public entry_visitor { public: names_and_links_visitor(global_entry_data& data) @@ 
-157,6 +192,8 @@ class names_and_links_visitor : public entry_visitor { void visit(file* p) override { data_.add_name(p->name()); } + void visit(device* p) override { data_.add_name(p->name()); } + void visit(link* p) override { data_.add_name(p->name()); data_.add_link(p->linkname()); @@ -339,6 +376,12 @@ scanner_::scan_tree(const std::string& path, progress& prog) { prog.links_scanned++; break; + case entry::E_DEVICE: + case entry::E_OTHER: + prog.specials_found++; + pe->scan(*os_, prog); + break; + default: log_.error() << "unsupported entry type: " << int(pe->type()); prog.errors++; @@ -444,13 +487,24 @@ void scanner_::scan(filesystem_writer& fsw, log_.info() << "assigning file inodes..."; im->number_inodes(first_file_inode); - log_.info() << "building metadata..."; - global_entry_data ge_data(options_); - thrift::metadata::metadata mv2; + mv2.link_index.resize(first_file_inode - first_link_inode); + log_.info() << "assigning device inodes..."; + uint32_t first_device_inode = first_file_inode + im->count(); + device_set_inode_visitor devsiv(first_device_inode); + root->accept(devsiv); /* devsiv holds first_device_inode by reference and increments it once per device */ + mv2.devices_ref() = std::move(devsiv.device_ids()); + + log_.info() << "assigning pipe/socket inodes..."; + uint32_t first_pipe_inode = first_device_inode; /* already advanced by devsiv, so this is the first inode number after all device inodes */ + pipe_set_inode_visitor pipsiv(first_pipe_inode); + root->accept(pipsiv); + + log_.info() << "building metadata..."; + wg_.add_job([&] { log_.info() << "saving names and links..."; names_and_links_visitor nlv(ge_data); @@ -520,7 +574,7 @@ void scanner_::scan(filesystem_writer& fsw, log_.debug() << "total number of chunks: " << mv2.chunks.size(); log_.info() << "saving directories..."; - mv2.entry_index.resize(first_file_inode + im->count()); + mv2.entry_index.resize(first_pipe_inode); /* pipsiv advanced first_pipe_inode, so it is now one past the last assigned inode number */ mv2.directories.reserve(first_link_inode + 1); save_directories_visitor sdv(first_link_inode); root->accept(sdv); diff --git a/test/dwarfs.cpp b/test/dwarfs.cpp index 6ea8a218..06613b90 100644 --- a/test/dwarfs.cpp +++ b/test/dwarfs.cpp @@ 
-19,8 +19,8 @@ * along with dwarfs. If not, see . */ +#include #include -#include #include @@ -67,17 +67,21 @@ struct simplestat { ::uid_t st_uid; ::gid_t st_gid; ::off_t st_size; + ::dev_t st_rdev; /* copied into st_rdev by the mock lstat */ }; -std::unordered_map statmap{ - {"/", {S_IFDIR | 0777, 1000, 1000, 0}}, - {"//test.pl", {S_IFREG | 0644, 1000, 1000, 0}}, - {"//somelink", {S_IFLNK | 0777, 1000, 1000, 16}}, - {"//somedir", {S_IFDIR | 0777, 1000, 1000, 0}}, - {"//foo.pl", {S_IFREG | 0600, 1337, 0, 23456}}, - {"//ipsum.txt", {S_IFREG | 0644, 1000, 1000, 2000000}}, - {"//somedir/ipsum.py", {S_IFREG | 0644, 1000, 1000, 10000}}, - {"//somedir/bad", {S_IFLNK | 0777, 1000, 1000, 6}}, +std::map statmap{ + {"", {S_IFDIR | 0777, 1000, 100, 0, 0}}, + {"/test.pl", {S_IFREG | 0644, 1000, 100, 0, 0}}, + {"/somelink", {S_IFLNK | 0777, 1000, 100, 16, 0}}, + {"/somedir", {S_IFDIR | 0777, 1000, 100, 0, 0}}, + {"/foo.pl", {S_IFREG | 0600, 1337, 0, 23456, 0}}, + {"/ipsum.txt", {S_IFREG | 0644, 1000, 100, 2000000, 0}}, + {"/somedir/ipsum.py", {S_IFREG | 0644, 1000, 100, 10000, 0}}, + {"/somedir/bad", {S_IFLNK | 0777, 1000, 100, 6, 0}}, + {"/somedir/pipe", {S_IFIFO | 0644, 1000, 100, 0, 0}}, + {"/somedir/null", {S_IFCHR | 0666, 0, 0, 0, 259}}, + {"/somedir/zero", {S_IFCHR | 0666, 0, 0, 0, 261}}, }; } // namespace @@ -95,18 +99,15 @@ class mmap_mock : public mmif { class os_access_mock : public os_access { public: std::shared_ptr opendir(const std::string& path) const override { - if (path == "/") { + if (path.empty()) { std::vector files{ ".", "..", "test.pl", "somelink", "somedir", "foo.pl", "ipsum.txt", }; return std::make_shared(std::move(files)); - } else if (path == "//somedir") { + } else if (path == "/somedir") { std::vector files{ - ".", - "..", - "ipsum.py", - "bad", + ".", "..", "ipsum.py", "bad", "pipe", "null", "zero", }; return std::make_shared(std::move(files)); @@ -125,12 +126,13 @@ class os_access_mock : public os_access { st->st_atime = 123; st->st_mtime = 234; st->st_ctime = 345; + st->st_rdev = 
sst.st_rdev; } std::string readlink(const std::string& path, size_t size) const override { - if (path == "//somelink" && size == 16) { + if (path == "/somelink" && size == 16) { return "somedir/ipsum.py"; - } else if (path == "//somedir/bad" && size == 6) { + } else if (path == "/somedir/bad" && size == 6) { return "../foo"; } @@ -196,7 +198,7 @@ void basic_end_to_end_test(const std::string& compressor, block_compressor bc(compressor); filesystem_writer fsw(oss, lgr, wg, prog, bc, 64 << 20); - s.scan(fsw, "/", prog); + s.scan(fsw, "", prog); auto mm = std::make_shared(oss.str()); @@ -209,9 +211,10 @@ void basic_end_to_end_test(const std::string& compressor, struct ::stat st; ASSERT_TRUE(entry); - EXPECT_EQ(fs.getattr(*entry, &st), 0); EXPECT_EQ(st.st_size, 23456); + EXPECT_EQ(st.st_uid, 1337); + EXPECT_EQ(st.st_gid, 0); int inode = fs.open(*entry); EXPECT_GE(inode, 0); @@ -223,20 +226,57 @@ void basic_end_to_end_test(const std::string& compressor, entry = fs.find("/somelink"); + ASSERT_TRUE(entry); EXPECT_EQ(fs.getattr(*entry, &st), 0); EXPECT_EQ(st.st_size, 16); + EXPECT_EQ(st.st_uid, 1000); + EXPECT_EQ(st.st_gid, 100); + EXPECT_EQ(st.st_rdev, 0); /* NOTE(review): st is one object reused across getattr calls, and the visible getattr hunk assigns st_rdev only for devices; this check presumably relies on getattr zeroing the buffer first - confirm */ std::string link; EXPECT_EQ(fs.readlink(*entry, &link), 0); EXPECT_EQ(link, "somedir/ipsum.py"); + EXPECT_FALSE(fs.find("/somedir/nope")); + entry = fs.find("/somedir/bad"); + ASSERT_TRUE(entry); EXPECT_EQ(fs.getattr(*entry, &st), 0); EXPECT_EQ(st.st_size, 6); EXPECT_EQ(fs.readlink(*entry, &link), 0); EXPECT_EQ(link, "../foo"); + + entry = fs.find("/somedir/pipe"); + + ASSERT_TRUE(entry); + EXPECT_EQ(fs.getattr(*entry, &st), 0); + EXPECT_EQ(st.st_size, 0); + EXPECT_EQ(st.st_uid, 1000); + EXPECT_EQ(st.st_gid, 100); + EXPECT_TRUE(S_ISFIFO(st.st_mode)); + EXPECT_EQ(st.st_rdev, 0); + + entry = fs.find("/somedir/null"); + + ASSERT_TRUE(entry); + EXPECT_EQ(fs.getattr(*entry, &st), 0); + EXPECT_EQ(st.st_size, 0); + EXPECT_EQ(st.st_uid, 0); + EXPECT_EQ(st.st_gid, 0); + EXPECT_TRUE(S_ISCHR(st.st_mode)); + EXPECT_EQ(st.st_rdev, 
259); + + entry = fs.find("/somedir/zero"); + + ASSERT_TRUE(entry); + EXPECT_EQ(fs.getattr(*entry, &st), 0); + EXPECT_EQ(st.st_size, 0); + EXPECT_EQ(st.st_uid, 0); + EXPECT_EQ(st.st_gid, 0); + EXPECT_TRUE(S_ISCHR(st.st_mode)); + EXPECT_EQ(st.st_rdev, 261); } std::vector const compressions{"null", diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index 54fd121e..8f485a4f 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -104,7 +104,11 @@ struct metadata { */ 2: required list directories, - // all entries, can be looked up by inode through entry_index + /** + * All entries, can be looked up by inode through entry_index, or by + * directory through `first_entry`, where the entries will be between + * `directories[n].first_entry` and `directories[n+1].first_entry`. + */ 3: required list entries, /** @@ -114,7 +118,17 @@ struct metadata { */ 4: required list chunk_index, - // entry index, indexed by inode + /** + * Entry index, indexed by inode + * + * This list contains all inodes strictly in the following order: + * + * - directories, starting with the root dir at inode 0 + * - symbolic links + * - regular files + * - character and block devices + * - named pipes and sockets + */ 5: required list entry_index, // link index, indexed by (inode - link_index_offset) @@ -154,4 +168,7 @@ struct metadata { // total file system size 16: required UInt64 total_fs_size, + + // device ids, for lookup by (inode - device_index_offset) + 17: optional list devices, }