feat(dwarfsck): add --list, --checksum, --verbose (fixes gh #192)

This commit is contained in:
Marcus Holland-Moritz 2024-02-07 17:53:25 +01:00
parent d6bd917fae
commit 8831009a52
3 changed files with 243 additions and 4 deletions

View File

@ -25,6 +25,9 @@ with a non-zero exit code.
- `-q`, `--quiet`: - `-q`, `--quiet`:
Don't produce any output unless there is an error. Don't produce any output unless there is an error.
- `-v`, `--verbose`:
Produce verbose output, where applicable.
- `-O`, `--image-offset=`*value*|`auto`: - `-O`, `--image-offset=`*value*|`auto`:
Specify the byte offset at which the filesystem is located in the image. Specify the byte offset at which the filesystem is located in the image.
Use `auto` to detect the offset automatically. This is also the default. Use `auto` to detect the offset automatically. This is also the default.
@ -36,6 +39,19 @@ with a non-zero exit code.
header is present, the program will exit with exit code 2 and emit a header is present, the program will exit with exit code 2 and emit a
warning. warning.
- `-l`, `--list`:
List all entries in the file system image. The output is similar to that of `tar -t`.
With `--verbose`, also print details about each entry.
- `--checksum=`*name*:
Produce a checksum using the specified algorithm for each regular file in
the file system image. This can be used to easily verify the file system
image against local files, e.g.:
```
dwarfsck --checksum=sha512 /tmp/fs.dwarfs | sha512sum --check
```
- `-n`, `--num-workers=`*value*: - `-n`, `--num-workers=`*value*:
Number of worker threads used for integrity checking. Number of worker threads used for integrity checking.

View File

@ -19,18 +19,25 @@
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>. * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/ */
#include <algorithm>
#include <cstring> #include <cstring>
#include <iostream> #include <iostream>
#include <mutex>
#include <string_view> #include <string_view>
#include <vector> #include <vector>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <fmt/chrono.h>
#include <fmt/format.h>
#include <folly/String.h> #include <folly/String.h>
#include <folly/gen/String.h>
#include <folly/json.h> #include <folly/json.h>
#include <folly/portability/Unistd.h> #include <folly/portability/Unistd.h>
#include <folly/system/HardwareConcurrency.h> #include <folly/system/HardwareConcurrency.h>
#include "dwarfs/checksum.h"
#include "dwarfs/error.h" #include "dwarfs/error.h"
#include "dwarfs/file_access.h" #include "dwarfs/file_access.h"
#include "dwarfs/filesystem_v2.h" #include "dwarfs/filesystem_v2.h"
@ -41,24 +48,131 @@
#include "dwarfs/os_access.h" #include "dwarfs/os_access.h"
#include "dwarfs/tool.h" #include "dwarfs/tool.h"
#include "dwarfs/util.h" #include "dwarfs/util.h"
#include "dwarfs/worker_group.h"
#include "dwarfs_tool_main.h" #include "dwarfs_tool_main.h"
namespace dwarfs { namespace dwarfs {
namespace po = boost::program_options; namespace po = boost::program_options;
namespace {
// Print a listing of all entries in the file system to `iol.out`.
//
// Without `verbose`, one sanitized path is printed per line and entries whose
// path is empty are skipped. With `verbose`, every entry is printed together
// with its mode string, uid/gid, size and modification time; symbolic links
// additionally show their target. The uid, gid and size columns are padded to
// the widest value found in the image.
void do_list_files(filesystem_v2& fs, iolayer const& iol, bool verbose) {
  if (!verbose) {
    // The plain listing needs neither attributes nor column widths, so avoid
    // the extra walk and the per-entry getattr() calls entirely.
    fs.walk([&](auto const& de) {
      auto name = de.unix_path();
      utf8_sanitize(name);
      if (!name.empty()) {
        iol.out << name << "\n";
      }
    });
    return;
  }

  // Number of characters needed to print the largest value in `vec`.
  // std::max_element returns end() for an empty range, which must not be
  // dereferenced; fall back to a single column in that case.
  auto max_width = [](auto const& vec) -> size_t {
    auto max = std::max_element(vec.begin(), vec.end());
    if (max == vec.end()) {
      return 1;
    }
    return std::to_string(*max).size();
  };

  auto const uid_width = max_width(fs.get_all_uids());
  auto const gid_width = max_width(fs.get_all_gids());

  // First pass: find the largest size so the size column can be aligned.
  file_stat::off_type max_inode_size{0};
  fs.walk([&](auto const& de) {
    file_stat st;
    fs.getattr(de.inode(), &st);
    max_inode_size = std::max(max_inode_size, st.size);
  });

  // Measure the width using the same locale-aware format that is used for
  // printing below.
  auto const inode_size_width = fmt::format("{:L}", max_inode_size).size();

  // Second pass: print one line per entry.
  fs.walk([&](auto const& de) {
    auto iv = de.inode();
    file_stat st;
    fs.getattr(iv, &st);

    auto name = de.unix_path();
    utf8_sanitize(name);

    if (iv.is_symlink()) {
      auto target = fs.readlink(iv).value();
      utf8_sanitize(target);
      name += " -> " + target;
    }

    // Arguments 0-2 are the dynamic column widths, 3-8 the actual fields.
    iol.out << fmt::format(
        "{3} {4:{0}}/{5:{1}} {6:{2}L} {7:%Y-%m-%d %H:%M} {8}\n", uid_width,
        gid_width, inode_size_width, iv.mode_string(), iv.getuid(),
        iv.getgid(), st.size, fmt::localtime(st.mtime), name);
  });
}
// Compute and print a checksum for every regular file in the file system.
//
// Entries are visited via `walk_data_order()`. For each regular file a job is
// added to a worker group created with `num_workers`; the job reads the file
// contents, hashes them with `algo` and writes one "<hexdigest> <path>" line
// to `iol.out`. Any failure is logged as an error and the file is skipped.
void do_checksum(logger& lgr, filesystem_v2& fs, iolayer const& iol,
                 std::string const& algo, size_t num_workers) {
  LOG_PROXY(debug_logger_policy, lgr);

  worker_group workers{lgr, *iol.os, "checksum", num_workers};

  // Guards iol.out so that lines written by different jobs don't interleave.
  std::mutex out_mutex;

  fs.walk_data_order([&](auto const& entry) {
    auto inode = entry.inode();

    if (!inode.is_regular_file()) {
      return;
    }

    // `entry` and `inode` are captured by value; presumably the job can run
    // after this callback has returned.
    workers.add_job([&, entry, inode] {
      file_stat st;

      if (fs.getattr(entry.inode(), &st) != 0) {
        LOG_ERROR << "failed to get attributes for inode " << inode.inode_num();
        return;
      }

      auto chunks = fs.readv(inode.inode_num(), st.size);

      if (!chunks) {
        LOG_ERROR << "failed to read inode " << inode.inode_num() << ": "
                  << std::strerror(-chunks.error());
        return;
      }

      checksum hasher(algo);
      auto& futures = chunks.value();

      try {
        for (auto& pending : futures) {
          auto block = pending.get();
          hasher.update(block.data(), block.size());
        }
      } catch (std::exception const& e) {
        LOG_ERROR << "error reading data from inode " << inode.inode_num()
                  << ": " << e.what();
        return;
      }

      auto line = fmt::format("{} {}\n", hasher.hexdigest(), entry.unix_path());

      std::lock_guard guard(out_mutex);
      iol.out << line;
    });
  });

  // Block until all queued checksum jobs have finished.
  workers.wait();
}
} // namespace
int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) { int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
using namespace folly::gen;
const size_t num_cpu = std::max(folly::hardware_concurrency(), 1u); const size_t num_cpu = std::max(folly::hardware_concurrency(), 1u);
std::string input, export_metadata, image_offset; auto algo_list = checksum::available_algorithms();
auto checksum_desc = "print checksums for all files (" +
(from(algo_list) | unsplit(", ")) + ")";
std::string input, export_metadata, image_offset, checksum_algo;
logger_options logopts; logger_options logopts;
size_t num_workers; size_t num_workers;
int detail; int detail;
bool quiet{false}; bool quiet{false};
bool verbose{false};
bool output_json{false}; bool output_json{false};
bool check_integrity{false}; bool check_integrity{false};
bool no_check{false}; bool no_check{false};
bool print_header{false}; bool print_header{false};
bool list_files{false};
// clang-format off // clang-format off
po::options_description opts("Command line options"); po::options_description opts("Command line options");
@ -72,12 +186,21 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
("quiet,q", ("quiet,q",
po::value<bool>(&quiet)->zero_tokens(), po::value<bool>(&quiet)->zero_tokens(),
"don't print anything unless an error occurs") "don't print anything unless an error occurs")
("verbose,v",
po::value<bool>(&verbose)->zero_tokens(),
"produce verbose output")
("image-offset,O", ("image-offset,O",
po::value<std::string>(&image_offset)->default_value("auto"), po::value<std::string>(&image_offset)->default_value("auto"),
"filesystem image offset in bytes") "filesystem image offset in bytes")
("print-header,H", ("print-header,H",
po::value<bool>(&print_header)->zero_tokens(), po::value<bool>(&print_header)->zero_tokens(),
"print filesystem header to stdout and exit") "print filesystem header to stdout and exit")
("list,l",
po::value<bool>(&list_files)->zero_tokens(),
"list all files and exit")
("checksum",
po::value<std::string>(&checksum_algo),
checksum_desc.c_str())
("num-workers,n", ("num-workers,n",
po::value<size_t>(&num_workers)->default_value(num_cpu), po::value<size_t>(&num_workers)->default_value(num_cpu),
"number of reader worker threads") "number of reader worker threads")
@ -138,10 +261,16 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
return 1; return 1;
} }
if (vm.count("checksum") && !checksum::is_available(checksum_algo)) {
LOG_WARN << "checksum algorithm not available: " << checksum_algo;
return 1;
}
if (print_header && if (print_header &&
(output_json || !export_metadata.empty() || check_integrity)) { (output_json || !export_metadata.empty() || check_integrity ||
list_files || !checksum_algo.empty())) {
LOG_WARN << "--print-header is mutually exclusive with --json, " LOG_WARN << "--print-header is mutually exclusive with --json, "
"--export-metadata and --check-integrity"; "--export-metadata, --check-integrity, --list and --checksum";
return 1; return 1;
} }
@ -191,7 +320,7 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
: filesystem_check_level::CHECKSUM; : filesystem_check_level::CHECKSUM;
auto errors = no_check ? 0 : fs.check(level, num_workers); auto errors = no_check ? 0 : fs.check(level, num_workers);
if (!quiet) { if (!quiet && !list_files && checksum_algo.empty()) {
if (output_json) { if (output_json) {
iol.out << folly::toPrettyJson(fs.info_as_dynamic(detail)) << "\n"; iol.out << folly::toPrettyJson(fs.info_as_dynamic(detail)) << "\n";
} else { } else {
@ -199,6 +328,14 @@ int dwarfsck_main(int argc, sys_char** argv, iolayer const& iol) {
} }
} }
if (list_files) {
do_list_files(fs, iol, verbose);
}
if (!checksum_algo.empty()) {
do_checksum(lgr, fs, iol, checksum_algo, num_workers);
}
if (errors > 0) { if (errors > 0) {
return 1; return 1;
} }

View File

@ -31,6 +31,7 @@
#include <gmock/gmock.h> #include <gmock/gmock.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <fmt/chrono.h>
#include <fmt/format.h> #include <fmt/format.h>
#include <folly/FileUtil.h> #include <folly/FileUtil.h>
@ -2054,6 +2055,91 @@ TEST(dwarfsck_test, export_metadata_close_error) {
::testing::HasSubstr("failed to close metadata output file")); ::testing::HasSubstr("failed to close metadata output file"));
} }
// Requesting an unknown checksum algorithm must fail with a non-zero exit
// code and report the offending algorithm name in the error output.
TEST(dwarfsck_test, checksum_algorithm_not_available) {
  using ::testing::HasSubstr;

  auto t = dwarfsck_tester::create_with_image();

  auto const exit_code = t.run({"image.dwarfs", "--checksum=grmpf"});
  EXPECT_NE(0, exit_code) << t.err();

  EXPECT_THAT(t.err(), HasSubstr("checksum algorithm not available: grmpf"));
}
// `--list` without `--verbose` prints exactly one line per named entry of the
// test image; order is not checked, only the set of paths.
TEST(dwarfsck_test, list_files) {
  auto t = dwarfsck_tester::create_with_image();

  auto const exit_code = t.run({"image.dwarfs", "--list"});
  EXPECT_EQ(0, exit_code) << t.err();

  std::set<std::string> const expected{
      "test.pl",     "somelink",      "somedir",   "foo.pl",
      "bar.pl",      "baz.pl",        "ipsum.txt", "somedir/ipsum.py",
      "somedir/bad", "somedir/empty", "empty",
  };

  // Split the output into lines, ignoring empty pieces, and collect them in a
  // set so the comparison is independent of the listing order.
  auto listing = t.out();
  std::set<std::string> actual;
  folly::splitTo<std::string>('\n', listing,
                              std::inserter(actual, actual.end()), true);

  EXPECT_EQ(expected, actual);
}
// `--list --verbose` prints one detailed line per entry (12 for the test
// image). A few representative lines are checked with regular expressions:
// the line with an empty name, a regular file and a symbolic link.
TEST(dwarfsck_test, list_files_verbose) {
  auto t = dwarfsck_tester::create_with_image();
  EXPECT_EQ(0, t.run({"image.dwarfs", "--list", "--verbose"})) << t.err();

  auto out = t.out();

  auto num_lines = std::count(out.begin(), out.end(), '\n');
  EXPECT_EQ(12, num_lines);

  // Notes on the patterns:
  //  - The gid column is padded to the width of the widest gid in the image,
  //    so the padding before `0` is matched with `\s*` instead of a fixed
  //    number of literal spaces.
  //  - Dots in file names are escaped so they only match a literal '.'.
  //  - Timestamps are rendered with the same fmt spec and local time zone
  //    as the tool uses.
  std::vector<std::string> expected_re{
      fmt::format("drwxrwxrwx\\s+1000/100\\s+8\\s+{:%Y-%m-%d %H:%M}\\s*\n",
                  fmt::localtime(2)),
      fmt::format("-rw-------\\s+1337/\\s*0\\s+{:L}\\s+{:%Y-%m-%d "
                  "%H:%M}\\s+baz\\.pl\n",
                  23456, fmt::localtime(8002)),
      fmt::format("lrwxrwxrwx\\s+1000/100\\s+16\\s+{:%Y-%m-%d "
                  "%H:%M}\\s+somelink -> somedir/ipsum\\.py\n",
                  fmt::localtime(2002)),
  };

  for (auto const& str : expected_re) {
    std::regex re{str};
    EXPECT_TRUE(std::regex_search(out, re)) << "[" << str << "]\n" << out;
  }
}
// `--checksum=md5` prints one "<hash> <path>" line per regular file of the
// test image (8 files). Lines may appear in any order, so they are collected
// in a map keyed by path before being compared.
TEST(dwarfsck_test, checksum_files) {
  auto t = dwarfsck_tester::create_with_image();
  EXPECT_EQ(0, t.run({"image.dwarfs", "--checksum=md5"})) << t.err();

  auto out = t.out();

  auto num_lines = std::count(out.begin(), out.end(), '\n');
  EXPECT_EQ(8, num_lines);

  std::map<std::string, std::string> actual;

  std::vector<std::string_view> lines;
  folly::split('\n', out, lines);

  for (auto const& line : lines) {
    // The trailing newline produces one empty piece; ignore it.
    if (line.empty()) {
      continue;
    }
    std::string file, hash;
    // folly::split reports whether the line had exactly the requested number
    // of fields; check it so a malformed line is flagged right here instead
    // of silently producing empty fields.
    EXPECT_TRUE(folly::split(" ", line, hash, file))
        << "malformed checksum line: " << line;
    // Every path must be reported exactly once.
    EXPECT_TRUE(actual.emplace(file, hash).second);
  }

  std::map<std::string, std::string> const expected{
      {"empty", "d41d8cd98f00b204e9800998ecf8427e"},
      {"somedir/empty", "d41d8cd98f00b204e9800998ecf8427e"},
      {"test.pl", "d41d8cd98f00b204e9800998ecf8427e"},
      {"baz.pl", "e2bd36391abfd15dcc83cbdfb60a6bc3"},
      {"somedir/ipsum.py", "70fe813c36ed50ebd7f4991857683676"},
      {"foo.pl", "e2bd36391abfd15dcc83cbdfb60a6bc3"},
      {"bar.pl", "e2bd36391abfd15dcc83cbdfb60a6bc3"},
      {"ipsum.txt", "0782b6a546cedd8be8fc86ac47dc6d96"},
  };

  EXPECT_EQ(expected, actual);
}
class mkdwarfs_sim_order_test : public testing::TestWithParam<char const*> {}; class mkdwarfs_sim_order_test : public testing::TestWithParam<char const*> {};
TEST(mkdwarfs_test, max_similarity_size) { TEST(mkdwarfs_test, max_similarity_size) {