metadata_v2: v1 removal, part 1

This commit is contained in:
Marcus Holland-Moritz 2020-11-28 00:54:11 +01:00
parent 0edd99303a
commit e3a4f8db09
5 changed files with 65 additions and 342 deletions

View File

@ -32,14 +32,7 @@
#include <fuse3/fuse_lowlevel.h>
#define USE_META_V2
#ifdef USE_META_V2
#include "dwarfs/filesystem_v2.h"
#else
#include "dwarfs/filesystem.h"
#endif
#include "dwarfs/metadata_v2.h"
#include "dwarfs/mmap.h"
#include "dwarfs/options.h"
@ -78,16 +71,9 @@ const struct fuse_opt dwarfs_opts[] = {
DWARFS_OPT("workers=%s", workers_str, 0),
DWARFS_OPT("decratio=%s", decompress_ratio_str, 0), FUSE_OPT_END};
#ifdef USE_META_V2
using filesystem = filesystem_v2;
#define ENTRY_V2(e) (*(e))
#else
#define ENTRY_V2(e) (e)
#endif
options opts;
stream_logger s_lgr(std::cerr);
std::shared_ptr<filesystem> s_fs;
std::shared_ptr<filesystem_v2> s_fs;
void op_init(void* /*userdata*/, struct fuse_conn_info* /*conn*/) {
DEBUG_FUNC("")
@ -96,7 +82,7 @@ void op_init(void* /*userdata*/, struct fuse_conn_info* /*conn*/) {
bco.num_workers = opts.workers;
bco.decompress_ratio = opts.decompress_ratio;
s_fs =
std::make_shared<filesystem>(s_lgr, std::make_shared<mmap>(opts.fsimage),
std::make_shared<filesystem_v2>(s_lgr, std::make_shared<mmap>(opts.fsimage),
bco, &opts.stat_defaults, FUSE_ROOT_ID);
}
@ -111,12 +97,12 @@ void op_lookup(fuse_req_t req, fuse_ino_t parent, const char* name) {
int err = ENOENT;
try {
auto de = s_fs->find(parent, name);
auto entry = s_fs->find(parent, name);
if (de) {
if (entry) {
struct ::fuse_entry_param e;
err = s_fs->getattr(ENTRY_V2(de), &e.attr);
err = s_fs->getattr(*entry, &e.attr);
if (err == 0) {
e.generation = 1;
@ -147,12 +133,12 @@ void op_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info*) {
// TODO: merge with op_lookup
try {
auto de = s_fs->find(ino);
auto entry = s_fs->find(ino);
if (de) {
if (entry) {
struct ::stat stbuf;
err = s_fs->getattr(ENTRY_V2(de), &stbuf);
err = s_fs->getattr(*entry, &stbuf);
if (err == 0) {
fuse_reply_attr(req, &stbuf, std::numeric_limits<double>::max());
@ -178,11 +164,11 @@ void op_access(fuse_req_t req, fuse_ino_t ino, int mode) {
// TODO: merge with op_lookup
try {
auto de = s_fs->find(ino);
auto entry = s_fs->find(ino);
if (de) {
if (entry) {
auto ctx = fuse_req_ctx(req);
err = s_fs->access(ENTRY_V2(de), mode, ctx->uid, ctx->gid);
err = s_fs->access(*entry, mode, ctx->uid, ctx->gid);
}
} catch (const dwarfs::error& e) {
std::cerr << "ERROR: " << e.what() << std::endl;
@ -201,12 +187,12 @@ void op_readlink(fuse_req_t req, fuse_ino_t ino) {
int err = ENOENT;
try {
auto de = s_fs->find(ino);
auto entry = s_fs->find(ino);
if (de) {
if (entry) {
std::string str;
err = s_fs->readlink(ENTRY_V2(de), &str);
err = s_fs->readlink(*entry, &str);
if (err == 0) {
fuse_reply_readlink(req, str.c_str());
@ -231,23 +217,15 @@ void op_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info* fi) {
int err = ENOENT;
try {
auto de = s_fs->find(ino);
auto entry = s_fs->find(ino);
if (de) {
#ifdef USE_META_V2
if (S_ISDIR(de->mode())) {
#else
if (S_ISDIR(de->mode)) {
#endif
if (entry) {
if (S_ISDIR(entry->mode())) {
err = EISDIR;
} else if (fi->flags & (O_APPEND | O_CREAT | O_TRUNC)) {
err = EACCES;
} else {
#ifdef USE_META_V2
fi->fh = FUSE_ROOT_ID + de->inode();
#else
fi->fh = reinterpret_cast<intptr_t>(de);
#endif
fi->fh = FUSE_ROOT_ID + entry->inode();
fi->keep_cache = 1;
fuse_reply_open(req, fi);
return;
@ -271,13 +249,7 @@ void op_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
int err = ENOENT;
try {
#ifdef USE_META_V2
assert(fi->fh == ino);
#else
auto de = reinterpret_cast<const dir_entry*>(fi->fh);
if (de) {
#endif
iovec_read_buf buf;
ssize_t rv = s_fs->readv(ino, buf, size, off);
@ -291,9 +263,6 @@ void op_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
}
err = -rv;
#ifndef USE_META_V2
}
#endif
}
catch (const dwarfs::error& e) {
std::cerr << "ERROR: " << e.what() << std::endl;
@ -314,33 +283,25 @@ void op_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
int err = ENOENT;
try {
auto de = s_fs->find(ino);
auto dirent = s_fs->find(ino);
if (de) {
auto d = s_fs->opendir(ENTRY_V2(de));
if (dirent) {
auto dir = s_fs->opendir(*dirent);
if (d) {
off_t lastoff = s_fs->dirsize(ENTRY_V2(d));
#ifndef USE_META_V2
std::string name;
#endif
if (dir) {
off_t lastoff = s_fs->dirsize(*dir);
struct stat stbuf;
std::vector<char> buf(size);
size_t written = 0;
while (off < lastoff) {
#ifdef USE_META_V2
auto res = s_fs->readdir(*d, off);
auto res = s_fs->readdir(*dir, off);
assert(res);
auto [de2, name_view] = *res;
std::string name(name_view);
#else
auto de2 = s_fs->readdir(d, off, &name);
#endif
s_fs->getattr(de2, &stbuf);
/// std::cerr << ">>> " << off << "/" << lastoff << " - " << name << "
/// - " << stbuf.st_ino << std::endl;
auto [entry, name_view] = *res;
std::string name(name_view);
s_fs->getattr(entry, &stbuf);
size_t needed =
fuse_add_direntry(req, &buf[written], buf.size() - written,

View File

@ -19,37 +19,22 @@
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <atomic>
#include <cstring>
#include <deque>
#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
#include <boost/system/system_error.hpp>
#include <folly/Conv.h>
#include <folly/String.h>
#include <folly/small_vector.h>
#include <sparsehash/dense_hash_map>
#include <sparsehash/dense_hash_set>
#include "dwarfs/config.h"
#include "dwarfs/cyclic_hash.h"
#include "dwarfs/entry.h"
#include "dwarfs/filesystem_writer.h"
#include "dwarfs/fstypes.h"
#include "dwarfs/hash_util.h"
#include "dwarfs/inode_manager.h"
#include "dwarfs/logger.h"
#include "dwarfs/metadata.h"
#include "dwarfs/metadata_v2.h"
#include "dwarfs/metadata_writer.h"
#include "dwarfs/options.h"
@ -59,12 +44,6 @@
#include "dwarfs/script.h"
#include "dwarfs/util.h"
#include "dwarfs/gen-cpp2/metadata_layouts.h"
#include "dwarfs/gen-cpp2/metadata_types.h"
#include "dwarfs/gen-cpp2/metadata_types_custom_protocol.h"
#include <thrift/lib/cpp2/protocol/DebugProtocol.h>
#include <thrift/lib/thrift/gen-cpp2/frozen_types_custom_protocol.h>
namespace dwarfs {
template <typename LoggerPolicy>
@ -77,28 +56,6 @@ class scanner_ : public scanner::impl {
void scan(filesystem_writer& fsw, const std::string& path, progress& prog);
private:
// google::dense_hash_map that registers a value-initialized key as its
// "empty" marker as soon as it is constructed.
// NOTE(review): per the sparsehash documentation the empty key must never be
// stored as a real key, so Key() is presumably not a valid entry here -
// confirm against the callers.
template <typename Key, typename Value, typename HashKey = std::hash<Key>>
class fast_hash_map : public google::dense_hash_map<Key, Value, HashKey> {
  using dense_base = google::dense_hash_map<Key, Value, HashKey>;

 public:
  fast_hash_map() {
    dense_base::set_empty_key(Key());
  }
};
// google::dense_hash_set that registers a value-initialized element as its
// "empty" marker as soon as it is constructed.
// NOTE(review): per the sparsehash documentation the empty key must never be
// inserted as a real element, so T() is presumably not a valid member here -
// confirm against the callers.
template <typename T, typename HashT = std::hash<T>>
class fast_hash_set : public google::dense_hash_set<T, HashT> {
  using dense_base = google::dense_hash_set<T, HashT>;

 public:
  fast_hash_set() {
    dense_base::set_empty_key(T());
  }
};
// We want these to be ordered
// TODO: StringPiece?
// TODO: Use dense/unordered maps/sets and sort later?
using file_name_table_t =
fast_hash_map<size_t, fast_hash_set<std::string_view, folly::Hash>>;
std::unordered_map<std::string_view, size_t, folly::Hash>
compress_names_table(metadata_writer& mw,
const file_name_table_t& file_name) const;
const block_manager::config& cfg_;
const scanner_options& options_;
std::shared_ptr<entry_factory> entry_;
@ -125,94 +82,6 @@ scanner_<LoggerPolicy>::scanner_(logger& lgr, worker_group& wg,
, lgr_(lgr)
, log_(lgr) {}
// Writes every file name in `file_name` to the current section of `mw`,
// reusing bytes that are already present in the section where possible, and
// returns a map from each name to the offset recorded for it.
//
// Names are handled from the longest length bucket to the shortest, and in
// sorted order within a bucket. For each name an existing occurrence is
// looked up first: through a hash index keyed on four-byte sequences once
// that index is populated and the name is at least four bytes long, and
// through a linear std::search otherwise. Only names without a match are
// appended to the section.
template <typename LoggerPolicy>
std::unordered_map<std::string_view, size_t, folly::Hash>
scanner_<LoggerPolicy>::compress_names_table(
    metadata_writer& mw, const file_name_table_t& file_name) const {
  log_.info() << "compressing names table...";

  auto ti = log_.timed_info();

  // index: four bytes of section data (read as uint32_t) -> slot in
  // `positions`; each slot lists the section positions that start with
  // those four bytes.
  // NOTE(review): 0 is registered as the empty key, so a run of four zero
  // bytes in the section data must never be indexed - confirm this cannot
  // happen.
  google::dense_hash_map<uint32_t, uint32_t> index;
  using position_vector = folly::small_vector<uint32_t, 4>;
  std::vector<position_vector> positions;
  index.set_empty_key(0);

  // First section position that has not been added to the index yet.
  uint32_t index_pos = 0;

  std::unordered_map<std::string_view, size_t, folly::Hash> offset;
  size_t saved = 0;
  size_t orig_offset = mw.offset();

  // Collect the distinct name lengths, longest first.
  std::vector<size_t> sizes(file_name.size());
  std::transform(file_name.begin(), file_name.end(), sizes.begin(),
                 [](const auto& p) { return p.first; });
  std::sort(sizes.begin(), sizes.end(), std::greater<size_t>());

  for (auto size : sizes) {
    auto nsi = file_name.find(size);
    assert(nsi != file_name.end());

    // Sort the names of this length so they are processed in a fixed order.
    std::vector<std::string_view> names(nsi->second.size());
    std::copy(nsi->second.begin(), nsi->second.end(), names.begin());
    std::sort(names.begin(), names.end());

    for (auto k : names) {
      bool found = false;

      if (!index.empty() && k.size() >= sizeof(uint32_t)) {
        uint32_t key;
        std::memcpy(&key, k.data(), sizeof(key));

        auto it = index.find(key);

        if (it != index.end()) {
          for (uint32_t pos : positions[it->second]) {
            // A candidate close to the end of the section may have fewer
            // than k.size() bytes behind it; comparing it would read past
            // the data written so far, so it cannot be a match.
            if (pos + k.size() > mw.section_data_size()) {
              continue;
            }

            // The first four bytes are known to be equal (they are the
            // index key), so only the remainder needs to be compared.
            if (std::memcmp(mw.section_data() + pos + sizeof(key),
                            k.data() + sizeof(key),
                            k.size() - sizeof(key)) == 0) {
              offset[k] = mw.section_data_offset() + pos;
              saved += k.size();
              found = true;
              break;
            }
          }
        }
      } else {
        // Short name or empty index: fall back to a linear search.
        auto it = std::search(mw.section_begin(), mw.end(), k.begin(), k.end());

        if (it != mw.end()) {
          offset[k] = it - mw.begin();
          saved += k.size();
          found = true;
        }
      }

      if (!found) {
        offset[k] = mw.offset();
        mw.write(k);

        // Extend the index over every position that now has at least four
        // bytes of section data behind it.
        if (mw.section_data_size() >= sizeof(uint32_t)) {
          uint32_t last = mw.section_data_size() - sizeof(uint32_t);

          while (index_pos <= last) {
            uint32_t key;
            std::memcpy(&key, mw.section_data() + index_pos, sizeof(key));

            auto r = index.insert(std::make_pair(key, positions.size()));
            uint32_t pos_index;

            if (r.second) {
              // New key: give it a fresh position list.
              pos_index = positions.size();
              positions.resize(pos_index + 1);
            } else {
              pos_index = r.first->second;
            }

            positions[pos_index].push_back(index_pos++);
          }
        }
      }
    }
  }

  ti << "names table: " << size_with_unit(mw.offset() - orig_offset) << " ("
     << size_with_unit(saved) << " saved)";

  return offset;
}
class dir_set_inode_visitor : public entry_visitor {
public:
dir_set_inode_visitor(uint32_t& inode_no)
@ -250,24 +119,14 @@ class link_set_inode_visitor : public entry_visitor {
class names_and_links_visitor : public entry_visitor {
public:
names_and_links_visitor(metadata_writer& mw, global_entry_data& data)
: mw_(mw)
, data_(data) {}
names_and_links_visitor(global_entry_data& data)
: data_(data) {}
void visit(file* p) override { data_.add_name(p->name()); }
void visit(link* p) override {
data_.add_name(p->name());
data_.add_link(p->linkname());
const auto& name = p->linkname();
auto r = offset_.emplace(name, mw_.offset());
if (r.second) {
uint16_t len = folly::to<uint16_t>(name.size());
mw_.write(len);
mw_.write(name);
}
p->set_offset(r.first->second);
}
void visit(dir* p) override {
@ -277,24 +136,17 @@ class names_and_links_visitor : public entry_visitor {
}
private:
metadata_writer& mw_;
global_entry_data& data_;
std::unordered_map<std::string_view, size_t, folly::Hash> offset_;
};
class save_directories_visitor : public entry_visitor {
public:
save_directories_visitor(metadata_writer& mw, thrift::metadata::metadata& mv2,
save_directories_visitor(thrift::metadata::metadata& mv2,
global_entry_data const& ge_data,
std::vector<uint32_t>& dir_index,
std::vector<uint32_t>& index)
: mw_(mw)
, mv2_(mv2)
std::vector<uint32_t>& dir_index)
: mv2_(mv2)
, ge_data_(ge_data)
, dir_index_(dir_index)
, cb_([&](const entry* e, size_t offset) {
index.at(e->inode_num()) = folly::to<uint32_t>(offset);
}) {}
, dir_index_(dir_index) {}
void visit(file*) override {
// nothing
@ -308,22 +160,15 @@ class save_directories_visitor : public entry_visitor {
dir_index_.at(p->inode_num()) = mv2_.directories.size();
p->pack(mv2_, ge_data_);
p->set_offset(mw_.offset());
p->pack(mw_.buffer(p->packed_size()), cb_);
if (!p->has_parent()) {
cb_(p, mw_.offset());
p->pack_entry(mw_.buffer(p->packed_entry_size()));
p->pack_entry(mv2_, ge_data_);
}
}
private:
metadata_writer& mw_;
thrift::metadata::metadata& mv2_;
global_entry_data const& ge_data_;
std::vector<uint32_t>& dir_index_;
std::function<void(const entry* e, size_t offset)> cb_;
};
template <typename LoggerPolicy>
@ -436,20 +281,14 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
log_.info() << "waiting for background scanners...";
wg_.wait();
size_t total{0};
std::unordered_map<std::string_view, std::vector<file*>, folly::Hash>
file_hash;
file_name_table_t file_name;
// TODO: turn into visitor?
root->walk([&](entry* ep) {
if (auto fp = dynamic_cast<file*>(ep)) {
file_hash[fp->hash()].push_back(fp);
}
if (ep->has_parent()) {
const std::string& name = ep->name();
file_name[name.size()].insert(name);
total += name.size();
}
});
log_.info() << "finding duplicate files...";
@ -522,31 +361,22 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
im->number_inodes(first_file_inode);
log_.info() << "building metadata...";
std::vector<uint8_t> metadata_vec;
metadata_writer mw(lgr_, metadata_vec);
global_entry_data ge_data(
options_.no_time); // TODO: just pass options directly
thrift::metadata::metadata mv2;
std::vector<uint32_t> dir_index;
dir_index.resize(first_link_inode);
mv2.link_index.resize(first_file_inode - first_link_inode);
wg_.add_job([&] {
mw.start_section(section_type::META_TABLEDATA);
log_.info() << "saving links...";
names_and_links_visitor nlv(mw, ge_data);
names_and_links_visitor nlv(ge_data);
root->accept(nlv);
ge_data.index();
log_.debug() << "link data size = " << mw.section_data_size();
log_.info() << "saving names...";
auto name_offset = compress_names_table(mw, file_name);
log_.debug() << "name data size = " << mw.section_data_size();
log_.info() << "updating name offsets...";
root->walk([&](entry* ep) {
ep->update(ge_data);
@ -554,13 +384,6 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
mv2.link_index.at(ep->inode_num() - first_link_inode) =
ge_data.get_link_index(lp->linkname());
}
if (ep->has_parent()) {
auto i = name_offset.find(ep->name());
if (i == name_offset.end()) {
throw std::runtime_error("offset not found for entry name");
}
ep->set_name_offset(i->second);
}
});
});
@ -593,74 +416,32 @@ void scanner_<LoggerPolicy>::scan(filesystem_writer& fsw,
log_.debug() << "saved by segmenting: "
<< size_with_unit(prog.saved_by_segmentation);
// mv2.string_table = std::string(
// reinterpret_cast<char const*>(mw.section_data()),
// mw.section_data_size());
// TODO: not sure that's actually needed
// this is actually needed
root->set_name(std::string());
log_.info() << "saving chunks...";
std::vector<uint32_t> index;
index.resize(im->count() + 1);
mv2.chunk_index.resize(im->count() + 1);
// TODO: we should be able to start this once all blocks have been
// submitted for compression
mw.align(im->chunk_size());
im->for_each_inode([&](std::shared_ptr<inode> const& ino) {
index.at(ino->num() - first_file_inode) = folly::to<uint32_t>(mw.offset());
mv2.chunk_index.at(ino->num() - first_file_inode) = mv2.chunks.size();
mw.write(ino->chunks());
ino->append_chunks(mv2.chunks);
});
// insert dummy inode to help determine number of chunks per inode
index.at(im->count()) = folly::to<uint32_t>(mw.offset());
mv2.chunk_index.at(im->count()) = mv2.chunks.size();
mw.finish_section();
size_t num_chunks = (index.back() - index.front()) / sizeof(chunk_type);
log_.debug() << "total number of file inodes: " << im->count();
log_.debug() << "total number of chunks: " << num_chunks;
log_.info() << "saving chunk index...";
mw.start_section(section_type::META_CHUNK_INDEX);
mw.write(index);
mw.finish_section();
log_.debug() << "total number of chunks: " << mv2.chunks.size();
log_.info() << "saving directories...";
index.resize(first_file_inode + im->count());
mv2.entry_index.resize(first_file_inode + im->count());
mw.start_section(section_type::META_DIRECTORIES);
save_directories_visitor sdv(mw, mv2, ge_data, dir_index, index);
save_directories_visitor sdv(mv2, ge_data, dir_index);
root->accept(sdv);
mw.finish_section();
log_.info() << "saving inode index...";
mw.start_section(section_type::META_INODE_INDEX);
mw.write(index);
mw.finish_section();
log_.info() << "saving metadata config...";
mw.start_section(section_type::META_CONFIG);
meta_config mconf;
mconf.block_size_bits = folly::to<uint8_t>(im->block_size_bits());
mconf.de_type = entry_->de_type();
mconf.unused = 0;
mconf.inode_count = first_file_inode + im->count();
mconf.orig_fs_size = prog.original_size;
mconf.chunk_index_offset = first_file_inode;
mconf.inode_index_offset = 0;
mw.write(mconf);
mw.finish_section();
// TODO: remove all metadata v1 code
// fsw.write_metadata(std::move(metadata_vec));
{
// order directories by inode number
std::vector<thrift::metadata::directory> tmp = std::move(mv2.directories);
mv2.directories.reserve(tmp.size());
for (auto i : dir_index) {

View File

@ -22,7 +22,6 @@
#include <iostream>
#include <vector>
#include "dwarfs/filesystem.h"
#include "dwarfs/filesystem_v2.h"
#include "dwarfs/mmap.h"
#include "dwarfs/options.h"
@ -32,26 +31,22 @@ int main(int argc, char** argv) {
try {
dwarfs::stream_logger lgr(std::cerr, dwarfs::logger::DEBUG);
auto mm = std::make_shared<dwarfs::mmap>(argv[1]);
dwarfs::filesystem fs(lgr, mm, dwarfs::block_cache_options());
dwarfs::filesystem_v2 fs_v2(lgr, mm, dwarfs::block_cache_options());
dwarfs::filesystem_v2 fs(lgr, mm, dwarfs::block_cache_options());
if (argc == 3) {
auto de = fs.find(argv[2]);
auto entry = fs.find(argv[2]);
if (de) {
if (entry) {
struct ::stat stbuf;
fs.getattr(de, &stbuf);
fs.getattr(*entry, &stbuf);
std::vector<char> data(stbuf.st_size);
fs.read(stbuf.st_ino, &data[0], data.size(), 0);
std::cout.write(&data[0], data.size());
}
} else {
// TODO: add more usage options...
dwarfs::filesystem::identify(lgr, mm, std::cout);
fs.dump(std::cout);
dwarfs::filesystem_v2::identify(lgr, mm, std::cout);
fs_v2.dump(std::cout);
fs.dump(std::cout);
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;

View File

@ -47,7 +47,7 @@
#include "dwarfs/block_manager.h"
#include "dwarfs/console_writer.h"
#include "dwarfs/entry.h"
#include "dwarfs/filesystem.h"
#include "dwarfs/filesystem_v2.h"
#include "dwarfs/filesystem_writer.h"
#include "dwarfs/logger.h"
#include "dwarfs/lua_script.h"
@ -458,7 +458,7 @@ int mkdwarfs(int argc, char** argv) {
if (recompress) {
auto ti = log.timed_info();
filesystem::rewrite(lgr, prog, std::make_shared<dwarfs::mmap>(path), fsw);
filesystem_v2::rewrite(lgr, prog, std::make_shared<dwarfs::mmap>(path), fsw);
wg_writer.wait();
ti << "filesystem rewritten";
} else {

View File

@ -26,7 +26,6 @@
#include "dwarfs/block_compressor.h"
#include "dwarfs/entry.h"
#include "dwarfs/filesystem.h"
#include "dwarfs/filesystem_v2.h"
#include "dwarfs/filesystem_writer.h"
#include "dwarfs/logger.h"
@ -163,31 +162,6 @@ using namespace dwarfs;
namespace {
// Adapters that let the templated filesystem test treat the lookup results of
// both filesystem implementations alike.

// Pointer-style lookup result: handed back unchanged.
dir_entry const* get_entry(dir_entry const* de) {
  return de;
}

// Optional-style lookup result: unwrapped without an emptiness check, so the
// caller has to make sure `entry` holds a value first.
entry_view get_entry(std::optional<entry_view> entry) {
  return *entry;
}
// Checks a filesystem object of type T against the expected "/foo.pl" entry:
// the entry must be found, report a size of 23456 bytes, be openable, and its
// full contents must equal test::loremipsum() of the same length.
template <typename T>
void test_created_filesystem(T const& fs) {
auto de = fs.find("/foo.pl");
struct ::stat st;
// Fatal assertion: the remaining checks all need a valid lookup result.
ASSERT_TRUE(de);
// get_entry() turns the lookup result into whatever getattr()/open() expect.
auto entry = get_entry(de);
EXPECT_EQ(fs.getattr(entry, &st), 0);
EXPECT_EQ(st.st_size, 23456);
int inode = fs.open(entry);
EXPECT_GE(inode, 0);
// Read the whole file in one call, starting at offset 0.
std::vector<char> buf(st.st_size);
ssize_t rv = fs.read(inode, &buf[0], st.st_size, 0);
EXPECT_EQ(rv, st.st_size);
EXPECT_EQ(std::string(buf.begin(), buf.end()), test::loremipsum(st.st_size));
}
void basic_end_to_end_test(const std::string& compressor,
unsigned block_size_bits, file_order_mode file_order,
bool no_owner, bool no_time) {
@ -225,11 +199,23 @@ void basic_end_to_end_test(const std::string& compressor,
block_cache_options bco;
bco.max_bytes = 1 << 20;
filesystem fs(lgr, mm, bco);
test_created_filesystem(fs);
filesystem_v2 fs(lgr, mm, bco);
filesystem_v2 fs_v2(lgr, mm, bco);
test_created_filesystem(fs_v2);
auto entry = fs.find("/foo.pl");
struct ::stat st;
ASSERT_TRUE(entry);
EXPECT_EQ(fs.getattr(*entry, &st), 0);
EXPECT_EQ(st.st_size, 23456);
int inode = fs.open(*entry);
EXPECT_GE(inode, 0);
std::vector<char> buf(st.st_size);
ssize_t rv = fs.read(inode, &buf[0], st.st_size, 0);
EXPECT_EQ(rv, st.st_size);
EXPECT_EQ(std::string(buf.begin(), buf.end()), test::loremipsum(st.st_size));
}
std::vector<std::string> const compressions{"null",