feat: support case-insensitive lookups (fixes gh #232)

This commit is contained in:
Marcus Holland-Moritz 2024-11-18 00:47:02 +01:00
parent f01f4a4ed2
commit b349e584e0
13 changed files with 3546 additions and 12 deletions

View File

@ -0,0 +1,38 @@
#!/usr/bin/perl -w
use strict;
use warnings;

# Emits C++ initializer rows that pair runs of 256 consecutive code points
# with their case-folded counterparts. Runs in which folding changes nothing
# produce no output.
# Input data: https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt

# Code point => folded code point, taken from entries with status C or S.
my %fold_of;

while (my $line = <>) {
  chomp $line;

  # Skip comment lines and blank lines.
  next if $line =~ /^(#|\s*$)/;

  my ($char, $status, $fold, $comment) = split /\s*;\s*/, $line;
  next unless $status =~ /^[CS]$/;

  # The cleaned-up comment is only used by the debugging aid below.
  $comment =~ s/^#\s*//;
  # print " case 0x$char: return 0x$fold; // [$status] $comment\n";
  $fold_of{hex($char)} = hex($fold);
}

# Formats a code point as an escape sequence: \uXXXX below 0x10000,
# \UXXXXXXXX otherwise.
sub cp_to_str {
  my ($cp) = @_;
  return $cp < 0x10000
    ? sprintf("\\u%04X", $cp)
    : sprintf("\\U%08X", $cp);
}

# Every code point except the surrogate range 0xD800..0xDFFF.
my @valid_code_points = (0..0xD7FF, 0xE000..0x10FFFF);

while (@valid_code_points) {
  my @chunk = splice @valid_code_points, 0, 256;

  my $orig = join '', map { cp_to_str($_) } @chunk;
  my $folded = join '', map { cp_to_str($fold_of{$_} // $_) } @chunk;

  # Only runs that contain at least one folded code point are emitted.
  next if $orig eq $folded;
  print " {u8\"$orig\"sv, u8\"$folded\"sv},\n";
}

View File

@ -450,6 +450,7 @@ if(WITH_TESTS)
pcmaudio_categorizer_test
speedometer_test
terminal_test
unicode_test
utils_test
file_utils_test
worker_group_test

View File

@ -49,6 +49,7 @@ add_library(
src/internal/fs_section.cpp
src/internal/glob_to_regex.cpp
src/internal/string_table.cpp
src/internal/unicode_case_folding.cpp
src/internal/wcwidth.c
src/internal/worker_group.cpp

View File

@ -104,6 +104,17 @@ options:
overlays and want the file system to reflect its read-only
state, you can set this option.
- `-o case_insensitive`:
Perform case-insensitive lookups in the mounted file system,
i.e. an entry originally named `ReadMe.txt` can be accessed as
`readme.txt`, `README.TXT`, or `rEaDmE.tXt`. This works across
all platforms. When mounting a file system with many files, this
may be slightly slower and consume slightly more memory as case-
insensitive lookup requires an additional mapping table that is
built on-demand. Note that this is not supported if the file
system contains directories with entries that only differ in
case.
- `-o (no_)cache_image`:
By default, `dwarfs` tries to ensure that the compressed file
system image will not be cached by the kernel (i.e. the default

View File

@ -0,0 +1,32 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <string>
#include <string_view>
namespace dwarfs::internal {
/**
 * Takes a string view and returns a new `std::string`.
 *
 * NOTE(review): the definition is not part of this header. Presumably the
 * result is the case-folded form of the UTF-8 text in \p in, suitable for
 * case-insensitive comparison -- confirm against the implementation,
 * including how malformed UTF-8 is handled.
 */
std::string utf8_case_fold(std::string_view in);
/**
 * Same signature as utf8_case_fold().
 *
 * NOTE(review): presumably this variant skips validation of \p in and is
 * meant for input already known to be well-formed UTF-8 -- confirm the
 * exact contract against the implementation before relying on it.
 */
std::string utf8_case_fold_unchecked(std::string_view in);
} // namespace dwarfs::internal

View File

@ -32,6 +32,7 @@ struct metadata_options {
bool enable_nlink{false};
bool readonly{false};
bool check_consistency{false};
bool case_insensitive_lookup{false};
size_t block_size{512};
std::optional<file_stat::uid_type> fs_uid{};
std::optional<file_stat::gid_type> fs_gid{};

File diff suppressed because it is too large Load Diff

View File

@ -32,19 +32,28 @@
#include <boost/algorithm/string.hpp>
#include <boost/sort/flat_stable_sort/flat_stable_sort.hpp>
#include <thrift/lib/cpp2/frozen/FrozenUtil.h>
#include <thrift/lib/cpp2/protocol/DebugProtocol.h>
#include <thrift/lib/cpp2/protocol/Serializer.h>
#include <fmt/chrono.h>
#include <fmt/format.h>
#if FMT_VERSION >= 110000
#include <fmt/ranges.h>
#endif
#include <folly/container/F14Set.h>
#include <folly/portability/Stdlib.h>
#include <folly/portability/Unistd.h>
#include <folly/small_vector.h>
#include <folly/stats/Histogram.h>
#include <parallel_hashmap/phmap.h>
#include <range/v3/view/enumerate.hpp>
#include <range/v3/view/transform.hpp>
#include <dwarfs/error.h>
#include <dwarfs/file_stat.h>
@ -60,6 +69,7 @@
#include <dwarfs/internal/features.h>
#include <dwarfs/internal/packed_int_vector.h>
#include <dwarfs/internal/string_table.h>
#include <dwarfs/internal/unicode_case_folding.h>
#include <dwarfs/reader/internal/metadata_v2.h>
#include <dwarfs/gen-cpp2/metadata_layouts.h>
@ -415,7 +425,7 @@ class metadata_ final : public metadata_v2::impl {
, symlinks_(meta_.compact_symlinks()
? string_table(lgr, "symlinks", *meta_.compact_symlinks())
: string_table(meta_.symlinks()))
// clang-format off
, dir_icase_cache_{build_dir_icase_cache()} // clang-format off
PERFMON_CLS_PROXY_INIT(perfmon, "metadata_v2")
PERFMON_CLS_TIMER_INIT(find)
PERFMON_CLS_TIMER_INIT(getattr)
@ -661,6 +671,10 @@ class metadata_ final : public metadata_v2::impl {
nlohmann::json as_json(dir_entry_view entry) const;
nlohmann::json as_json(directory_view dir, dir_entry_view entry) const;
// Shared lookup routine behind `find`: binary-searches `range` for `name`.
//
// Each element of `range` is mapped through `index_map` to a directory entry
// index; the entry's name is then passed through `entry_name_transform`
// before being compared with `name`. Returns a view of the matching entry
// (rooted at `dir`), or `std::nullopt` if no entry compares equal.
// The search assumes the transformed names are ordered along `range`.
std::optional<dir_entry_view>
find_impl(directory_view dir, auto const& range, auto const& name,
auto const& index_map, auto const& entry_name_transform) const;
std::optional<dir_entry_view>
find(directory_view dir, std::string_view name) const;
@ -920,6 +934,79 @@ class metadata_ final : public metadata_v2::impl {
return packed_nlinks;
}
// Builds the per-directory tables used for case-insensitive lookups.
//
// Returns an empty vector unless `options_.case_insensitive_lookup` is set.
// Otherwise the result has one slot per element of `meta_.directories()`.
// A slot stays empty when the case-folded names of that directory's entries
// are already sorted in stored order; otherwise it holds the entry positions
// (relative to the directory's entry range) ordered by folded name.
// Entries whose folded names collide are only reported with a warning.
std::vector<packed_int_vector<uint32_t>> build_dir_icase_cache() const {
  std::vector<packed_int_vector<uint32_t>> cache;

  if (options_.case_insensitive_lookup) {
    auto ti = LOG_TIMED_INFO;

    size_t cached_dirs = 0;
    size_t cached_entries = 0;
    size_t cache_bytes = 0;

    auto const dir_count = meta_.directories().size();
    cache.resize(dir_count);

    // The last element of `directories()` is deliberately not visited.
    for (uint32_t inode = 0; inode < dir_count - 1; ++inode) {
      directory_view dir{inode, global_};
      auto range = dir.entry_range();

      // Fold every entry name exactly once up front; reusing the folded
      // names makes the sorting below significantly faster.
      std::vector<std::string> folded_names;
      folded_names.reserve(range.size());
      for (auto ix : range) {
        folded_names.push_back(
            utf8_case_fold_unchecked(dir_entry_view_impl::name(ix, global_)));
      }

      // Group entry positions by folded name so that collisions can be
      // detected and reported.
      phmap::flat_hash_map<std::string_view, folly::small_vector<uint32_t, 1>>
          by_folded_name;
      by_folded_name.reserve(range.size());
      for (size_t pos = 0; pos < folded_names.size(); ++pos) {
        by_folded_name[folded_names[pos]].push_back(pos);
      }

      for (auto& [name, indices] : by_folded_name) {
        if (indices.size() <= 1) {
          continue;
        }
        LOG_WARN << fmt::format(
            "case-insensitive collision in directory \"{}\" (inode={}): {}",
            dir.self_entry_view().unix_path(), inode,
            fmt::join(indices | ranges::views::transform([&](auto pos) {
                        return dir_entry_view_impl::name(range[pos], global_);
                      }),
                      ", "));
      }

      // Checking the folded names for sortedness here is cheaper than
      // checking the resulting permutation afterwards. Directories that are
      // already in order need no table at all.
      if (std::is_sorted(folded_names.begin(), folded_names.end())) {
        continue;
      }

      std::vector<uint32_t> order(range.size());
      std::iota(order.begin(), order.end(), 0);
      boost::sort::flat_stable_sort(
          order.begin(), order.end(), [&](auto lhs, auto rhs) {
            return folded_names[lhs] < folded_names[rhs];
          });

      auto& packed = cache[inode];
      packed.reset(std::bit_width(order.size()), order.size());
      for (size_t pos = 0; pos < order.size(); ++pos) {
        packed.set(pos, order[pos]);
      }

      ++cached_dirs;
      cached_entries += order.size();
      cache_bytes += packed.size_in_bytes();
    }

    ti << "built case-insensitive directory cache for " << cached_entries
       << " entries in " << cached_dirs << " out of " << dir_count - 1
       << " directories ("
       << size_with_unit(cache_bytes +
                         sizeof(decltype(cache)::value_type) * cache.size())
       << ")";
  }

  return cache;
}
size_t total_file_entries() const {
return (dev_inode_offset_ - file_inode_offset_) +
(meta_.dir_entries()
@ -943,6 +1030,7 @@ class metadata_ final : public metadata_v2::impl {
const int unique_files_;
const metadata_options options_;
const string_table symlinks_;
std::vector<packed_int_vector<uint32_t>> const dir_icase_cache_;
PERFMON_CLS_PROXY_DECL
PERFMON_CLS_TIMER_DECL(find)
PERFMON_CLS_TIMER_DECL(getattr)
@ -1687,6 +1775,31 @@ void metadata_<LoggerPolicy>::walk_data_order_impl(
}
}
// Binary-searches `range` for an entry whose transformed name equals `name`.
//
// `index_map` turns an element of `range` into a directory entry index and
// `entry_name_transform` is applied to that entry's name before comparing.
// Returns a view of the matching entry, or `std::nullopt` if there is none.
template <typename LoggerPolicy>
std::optional<dir_entry_view>
metadata_<LoggerPolicy>::find_impl(directory_view dir, auto const& range,
                                   auto const& name, auto const& index_map,
                                   auto const& entry_name_transform) const {
  // Comparison key of the directory entry with index `entry_ix`.
  auto key_of = [&](auto entry_ix) {
    return entry_name_transform(dir_entry_view_impl::name(entry_ix, global_));
  };

  auto pos = std::lower_bound(range.begin(), range.end(), name,
                              [&](auto slot, auto const& wanted) {
                                return key_of(index_map(slot)) < wanted;
                              });

  if (pos == range.end()) {
    return std::nullopt;
  }

  // lower_bound only yields the first key that is not less than `name`,
  // so equality still has to be checked.
  auto entry_ix = index_map(*pos);

  if (!(key_of(entry_ix) == name)) {
    return std::nullopt;
  }

  return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
      entry_ix, global_.self_dir_entry(dir.inode()), global_)};
}
template <typename LoggerPolicy>
std::optional<dir_entry_view>
metadata_<LoggerPolicy>::find(directory_view dir, std::string_view name) const {
@ -1694,19 +1807,21 @@ metadata_<LoggerPolicy>::find(directory_view dir, std::string_view name) const {
auto range = dir.entry_range();
auto it = std::lower_bound(
range.begin(), range.end(), name, [&](auto ix, std::string_view name) {
return internal::dir_entry_view_impl::name(ix, global_) < name;
});
if (it != range.end()) {
if (internal::dir_entry_view_impl::name(*it, global_) == name) {
return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
*it, global_.self_dir_entry(dir.inode()), global_)};
}
if (!options_.case_insensitive_lookup) {
return find_impl(dir, range, name, std::identity{}, std::identity{});
}
return std::nullopt;
auto const& cache = dir_icase_cache_[dir.inode()];
return find_impl(
dir, boost::irange(range.size()), utf8_case_fold(name),
[&cache, &range](auto ix) {
if (!cache.empty()) {
ix = cache[ix];
}
return range[ix];
},
utf8_case_fold_unchecked);
}
template <typename LoggerPolicy>

View File

@ -28,6 +28,7 @@
#include <sstream>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
// This needs to be included *after* gtest.h
@ -2054,3 +2055,183 @@ TEST(filesystem, multi_image) {
EXPECT_EQ("baz", fs.read_string(fs.open(baz->inode())));
}
}
// Verifies lookups with and without `case_insensitive_lookup`:
//  - with the option off, only exact names resolve;
//  - with the option on, differently-cased names resolve as well, names that
//    differ by more than case still fail, and the two entries that differ
//    only in case are reported as a collision in the log.
TEST(filesystem, case_insensitive_lookup) {
  auto input = std::make_shared<test::os_access_mock>();

  input->add_dir("");
  input->add_dir(u8"hEllÖwÖrLD");
  input->add_dir(u8"FÜñKÿStrÍñg");
  input->add_dir(u8"unícødérøcks");
  input->add_dir(u8"JÄLAPEÑOPEPPÉR");
  input->add_dir(u8"SpIcYsÜsHiRoLL");
  input->add_dir(u8"CAFÉMØCHAlatte");
  input->add_dir(u8"ČhàŧGƤŦ");
  input->add_dir(u8"lõREMÏpSüM");
  input->add_dir(u8"ŠåmpŁËŠTrInG");
  input->add_dir(u8"pythonprogramming");
  input->add_dir(u8"DÃTâScïÊNcË");
  input->add_dir(u8"AIISFÛTÛRË");
  input->add_dir(u8"readability");
  input->add_file(u8"TëStCãSeSçÉNâRïÖ", "testcasescenario");
  input->add_file(u8"lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding");
  input->add_file(u8"lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface");
  input->add_file(u8"lõREMÏpSüM/NØRTHèast", "northeast");
  input->add_file(u8"lõREMÏpSüM/SPACEadventure", "spaceadventure");
  input->add_file(u8"lõREMÏpSüM/cõMPLEXïTy🚀", "complexity");
  input->add_file(u8"lõREMÏpSüM/thisisatest", "thisisatest");
  input->add_file(u8"lõREMÏpSüM/thisISaTEST", "thisisatest");

  std::vector<std::u8string> case_sensitive_dirs{
      u8"/hEllÖwÖrLD",        u8"/FÜñKÿStrÍñg",    u8"/unícødérøcks",
      u8"/JÄLAPEÑOPEPPÉR",    u8"/SpIcYsÜsHiRoLL", u8"/CAFÉMØCHAlatte",
      u8"/ČhàŧGƤŦ",           u8"/lõREMÏpSüM",     u8"/ŠåmpŁËŠTrInG",
      u8"/pythonprogramming", u8"/DÃTâScïÊNcË",    u8"/AIISFÛTÛRË",
      u8"/readability",
  };

  std::vector<std::pair<std::u8string, std::string>> case_sensitive_files{
      {u8"/TëStCãSeSçÉNâRïÖ", "testcasescenario"},
      {u8"/lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding"},
      {u8"/lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface"},
      {u8"/lõREMÏpSüM/NØRTHèast", "northeast"},
      {u8"/lõREMÏpSüM/SPACEadventure", "spaceadventure"},
      {u8"/lõREMÏpSüM/cõMPLEXïTy🚀", "complexity"},
      {u8"/lõREMÏpSüM/thisisatest", "thisisatest"},
      {u8"/lõREMÏpSüM/thisISaTEST", "thisisatest"},
  };

  std::vector<std::u8string> case_insensitive_dirs{
      u8"/HELlÖwÖRLD",        u8"/FÜÑKÿSTríÑg",    u8"/uNÍcødéRøcks",
      u8"/JÄLApeñOPePPÉR",    u8"/SpiCysÜshiRoLL", u8"/CAféMØchAlatte",
      u8"/čhàŧgƥŧ",           u8"/lõremÏpsüM",     u8"/šåmpŁëšTrInG",
      u8"/pyTHonproGRamming", u8"/DãtÂScïêNcË",    u8"/AiisFÛTÛRË",
      u8"/reADabiLIty",
  };

  std::vector<std::pair<std::u8string, std::string>> case_insensitive_files{
      {u8"/TësTcãSeSçéNâRïÖ", "testcasescenario"},
      {u8"/lõRemïpSüM/ÆstHETÎCcØDing", "aestheticcoding"},
      {u8"/lõremïPSüM/smîlËYfàÇë😊", "smileyface"},
      {u8"/lõREMÏPsÜM/NØRthÈAst", "northeast"},
      {u8"/lõRemïPsüM/SPACEadvENTure", "spaceadventure"},
      {u8"/LÕREMÏpSüM/CõMPlexïTy🚀", "complexity"},
      {u8"/lõrEMÏpSüM/thiSISatest", "thisisatest"},
  };

  std::vector<std::u8string> non_matching_entries{
      u8"/HELlÖwÖRLDx",
      u8"/FÜÑKÿSTríÑj",
      u8"/uNÍcødéRcks",
      u8"/JÄLApeñOPePPÉ",
      u8"/SpiCysÜshiRoLLx",
      u8"/CAféMØchAltte",
      u8"/čhàŧgƥŧx",
      u8"/lõremÏpsü",
      u8"/šåmpŁëšTrnG",
      u8"/pyTHonproGRammin",
      u8"/DãtÂScïêNcËx",
      u8"/AiisFÛTÛTË",
      u8"/reADabiLItx",
      u8"/TësRcãSeSçéNâRïÖ",
      u8"/lõRemïpüM/ÆstHETÎCcØDing",
      u8"/lõremïPSüM/mîlËYfàÇë😊",
      u8"/lõRMÏPsÜM/NØRthÈAst",
      u8"/lõRemïPsüM/SPACEadvENTurex",
      u8"/LÕREMÏpSüM/CõMPexïTy🚀",
      u8"/lõrEMÏpSüM/thiSISatesy",
  };

  // Expects `path` to resolve to an entry in `fs`.
  auto const expect_found = [](auto& fs, std::u8string const& path) {
    auto name = u8string_to_string(path);
    auto dev = fs.find(name);
    EXPECT_TRUE(dev) << name;
  };

  // Expects `path` to resolve to a file whose contents equal `content`.
  // The contents are only read if the lookup succeeded, so a failed lookup
  // is reported as a test failure instead of dereferencing an empty result.
  auto const expect_file = [](auto& fs, std::u8string const& path,
                              std::string const& content) {
    auto name = u8string_to_string(path);
    auto dev = fs.find(name);
    EXPECT_TRUE(dev) << name;
    if (dev) {
      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
    }
  };

  // Expects `path` not to resolve to any entry in `fs`.
  auto const expect_missing = [](auto& fs, std::u8string const& path) {
    auto name = u8string_to_string(path);
    auto dev = fs.find(name);
    EXPECT_FALSE(dev) << name;
  };

  test::test_logger lgr;
  auto fsimage = build_dwarfs(lgr, input, "null");
  auto mm = std::make_shared<test::mmap_mock>(std::move(fsimage));

  lgr.clear();

  // Case-sensitive (default) behaviour: only exact names are found.
  {
    reader::filesystem_v2 fs(lgr, *input, mm,
                             {.metadata = {.case_insensitive_lookup = false}});

    EXPECT_TRUE(lgr.empty());

    for (auto const& dir : case_sensitive_dirs) {
      expect_found(fs, dir);
    }

    for (auto const& [file, content] : case_sensitive_files) {
      expect_file(fs, file, content);
    }

    for (auto const& dir : case_insensitive_dirs) {
      expect_missing(fs, dir);
    }

    for (auto const& entry : case_insensitive_files) {
      expect_missing(fs, entry.first);
    }

    for (auto const& ent : non_matching_entries) {
      expect_missing(fs, ent);
    }
  }

  lgr.clear();

  // Case-insensitive behaviour: differently-cased names are found as well,
  // and the colliding pair of entries is reported in the log.
  {
    reader::filesystem_v2 fs(lgr, *input, mm,
                             {.metadata = {.case_insensitive_lookup = true}});

    EXPECT_THAT(
        lgr.get_log(),
        testing::Contains(testing::ResultOf(
            [](const auto& entry) { return entry.output; },
            testing::AllOf(testing::HasSubstr(u8string_to_string(
                               u8"case-insensitive collision in directory "
                               u8"\"lõREMÏpSüM\" (inode=")),
                           testing::HasSubstr("thisISaTEST, thisisatest")))));

    for (auto const& dir : case_sensitive_dirs) {
      expect_found(fs, dir);
    }

    for (auto const& [file, content] : case_sensitive_files) {
      expect_file(fs, file, content);
    }

    for (auto const& dir : case_insensitive_dirs) {
      expect_found(fs, dir);
    }

    for (auto const& [file, content] : case_insensitive_files) {
      expect_file(fs, file, content);
    }

    for (auto const& ent : non_matching_entries) {
      expect_missing(fs, ent);
    }
  }
}

View File

@ -1053,6 +1053,7 @@ TEST_P(tools_test, end_to_end) {
std::vector<std::string> all_options{
"-s",
"-ocase_insensitive",
#ifndef _WIN32
"-oenable_nlink",
"-oreadonly",
@ -1074,6 +1075,7 @@ TEST_P(tools_test, end_to_end) {
for (unsigned bitmask = 0; bitmask < combinations; ++bitmask) {
std::vector<std::string> args;
bool case_insensitive{false};
#ifndef _WIN32
bool enable_nlink{false};
bool readonly{false};
@ -1083,6 +1085,9 @@ TEST_P(tools_test, end_to_end) {
for (size_t i = 0; i < all_options.size(); ++i) {
if ((1 << i) & bitmask) {
auto const& opt = all_options[i];
if (opt == "-ocase_insensitive") {
case_insensitive = true;
}
#ifndef _WIN32
if (opt == "-oreadonly") {
readonly = true;
@ -1139,6 +1144,12 @@ TEST_P(tools_test, end_to_end) {
EXPECT_EQ(st.st_gid, 3456) << runner.cmdline();
}
#endif
EXPECT_TRUE(fs::exists(mountpoint / "format.sh")) << runner.cmdline();
EXPECT_EQ(case_insensitive, fs::exists(mountpoint / "FORMAT.SH"))
<< runner.cmdline();
EXPECT_EQ(case_insensitive, fs::exists(mountpoint / "fOrMaT.Sh"))
<< runner.cmdline();
auto perfmon =
dwarfs::getxattr(mountpoint, "user.dwarfs.driver.perfmon");
#if DWARFS_PERFMON_ENABLED

111
test/unicode_test.cpp Normal file

File diff suppressed because one or more lines are too long

View File

@ -179,6 +179,7 @@ struct options {
#endif
int enable_nlink{0};
int readonly{0};
int case_insensitive{0};
int cache_image{0};
int cache_files{0};
size_t cachesize{0};
@ -258,6 +259,7 @@ constexpr struct ::fuse_opt dwarfs_opts[] = {
DWARFS_OPT("seq_detector=%s", seq_detector_thresh_str, 0),
DWARFS_OPT("enable_nlink", enable_nlink, 1),
DWARFS_OPT("readonly", readonly, 1),
DWARFS_OPT("case_insensitive", case_insensitive, 1),
DWARFS_OPT("cache_image", cache_image, 1),
DWARFS_OPT("no_cache_image", cache_image, 0),
DWARFS_OPT("cache_files", cache_files, 1),
@ -1224,6 +1226,7 @@ void usage(std::ostream& os, std::filesystem::path const& progname) {
<< " -o imagesize=NUM filesystem image size in bytes\n"
<< " -o enable_nlink show correct hardlink numbers\n"
<< " -o readonly show read-only file system\n"
<< " -o case_insensitive perform case-insensitive lookups\n"
<< " -o (no_)cache_image (don't) keep image in kernel cache\n"
<< " -o (no_)cache_files (don't) keep files in kernel cache\n"
<< " -o debuglevel=NAME " << logger::all_level_names() << "\n"
@ -1464,6 +1467,7 @@ void load_filesystem(dwarfs_userdata& userdata) {
fsopts.inode_reader.readahead = opts.readahead;
fsopts.metadata.enable_nlink = bool(opts.enable_nlink);
fsopts.metadata.readonly = bool(opts.readonly);
fsopts.metadata.case_insensitive_lookup = bool(opts.case_insensitive);
fsopts.metadata.block_size = opts.blocksize;
#ifndef _WIN32
fsopts.metadata.fs_uid = opts.fs_uid;

View File

@ -11,6 +11,7 @@
"boost-multi-index",
"boost-process",
"boost-program-options",
"boost-sort",
"boost-thread",
"boost-uuid",
"boost-variant",