fix(windows): handle invalid wide chars in file names (fixes gh #241)

Windows does not validate file names as UTF-16, so they may contain ill-formed sequences (e.g. unpaired surrogates).
Handle these gracefully when converting to UTF-8 instead of failing on them.
This commit is contained in:
Marcus Holland-Moritz 2024-10-10 19:45:24 +02:00
parent e8f084d183
commit 7431bb627c
5 changed files with 65 additions and 12 deletions

View File

@ -55,6 +55,7 @@ void utf8_sanitize(std::string& str);
void shorten_path_string(std::string& path, char separator, size_t max_len); void shorten_path_string(std::string& path, char separator, size_t max_len);
std::filesystem::path canonical_path(std::filesystem::path p); std::filesystem::path canonical_path(std::filesystem::path p);
std::string path_to_utf8_string_sanitized(std::filesystem::path const& p);
bool getenv_is_enabled(char const* var); bool getenv_is_enabled(char const* var);

View File

@ -121,6 +121,9 @@ class entry : public entry_interface {
private: private:
std::u8string u8name() const; std::u8string u8name() const;
#ifdef _WIN32
std::filesystem::path path_;
#endif
std::string name_; std::string name_;
std::weak_ptr<entry> parent_; std::weak_ptr<entry> parent_;
file_stat stat_; file_stat stat_;

View File

@ -26,6 +26,7 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <iostream> #include <iostream>
#include <type_traits>
#if __has_include(<utf8cpp/utf8.h>) #if __has_include(<utf8cpp/utf8.h>)
#include <utf8cpp/utf8.h> #include <utf8cpp/utf8.h>
@ -264,6 +265,25 @@ std::filesystem::path canonical_path(std::filesystem::path p) {
return p; return p;
} }
/// Convert a filesystem path to a UTF-8 encoded `std::string`.
///
/// When built for Windows with a `wchar_t` native path type, the native
/// string is converted with `WideCharToMultiByte(CP_UTF8, ...)`. The
/// conversion flags are left at 0 (no `WC_ERR_INVALID_CHARS`); the intent is
/// that ill-formed UTF-16 in the file name is substituted by the API rather
/// than reported as an error -- see the Win32 documentation for the exact
/// substitution rules.
///
/// In every other configuration the result is
/// `u8string_to_string(p.u8string())`.
///
/// @param p  The path to convert.
/// @return   The UTF-8 representation of `p`. On the Windows code path an
///           empty string is returned for an empty path or if the size query
///           fails; if the conversion itself fails or writes fewer bytes than
///           announced, the result is trimmed to the bytes actually written.
std::string path_to_utf8_string_sanitized(std::filesystem::path const& p) {
#ifdef _WIN32
  if constexpr (std::is_same_v<std::filesystem::path::value_type, wchar_t>) {
    auto const& in = p.native();
    if (in.empty()) {
      return {};
    }

    auto const in_len = static_cast<int>(in.size());

    // First call: no output buffer, only ask how many bytes are required.
    int const size_needed = ::WideCharToMultiByte(
        CP_UTF8, 0, in.data(), in_len, nullptr, 0, nullptr, nullptr);
    if (size_needed <= 0) {
      // The API signals failure with 0; never size the buffer from that.
      return {};
    }

    std::string out(static_cast<std::string::size_type>(size_needed), '\0');

    // Second call: perform the conversion into the pre-sized buffer.
    int const written =
        ::WideCharToMultiByte(CP_UTF8, 0, in.data(), in_len, out.data(),
                              size_needed, nullptr, nullptr);

    // Do not hand back NUL padding if the conversion failed or came up short.
    out.resize(written > 0 ? static_cast<std::string::size_type>(written) : 0);
    return out;
  }
#endif

  return u8string_to_string(p.u8string());
}
bool getenv_is_enabled(char const* var) { bool getenv_is_enabled(char const* var) {
if (auto val = std::getenv(var)) { if (auto val = std::getenv(var)) {
if (auto maybeBool = try_to<bool>(val); maybeBool && *maybeBool) { if (auto maybeBool = try_to<bool>(val); maybeBool && *maybeBool) {

View File

@ -58,20 +58,19 @@ bool is_root_path(std::string_view path) {
#endif #endif
} }
// Build the stored name for an entry from its path: entries that have a
// parent keep only the final path component, while an entry without a parent
// keeps the path exactly as given. Either way the UTF-8 form of the selected
// path is passed through u8string_to_string().
std::string entry_name(fs::path const& path, bool has_parent) {
  return u8string_to_string(has_parent ? path.filename().u8string()
                                       : path.u8string());
}
} // namespace } // namespace
entry::entry(fs::path const& path, std::shared_ptr<entry> parent, entry::entry(fs::path const& path, std::shared_ptr<entry> parent,
file_stat const& st) file_stat const& st)
: name_{entry_name(path, static_cast<bool>(parent))} #ifdef _WIN32
: path_{parent ? path.filename() : path}
, name_{path_to_utf8_string_sanitized(path_)}
#else
: name_{path_to_utf8_string_sanitized(parent ? path.filename() : path)}
#endif
, parent_{std::move(parent)} , parent_{std::move(parent)}
, stat_{st} {} , stat_{st} {
}
bool entry::has_parent() const { bool entry::has_parent() const {
if (parent_.lock()) { if (parent_.lock()) {
@ -88,11 +87,17 @@ void entry::set_name(const std::string& name) { name_ = name; }
std::u8string entry::u8name() const { return string_to_u8string(name_); } std::u8string entry::u8name() const { return string_to_u8string(name_); }
fs::path entry::fs_path() const { fs::path entry::fs_path() const {
#ifdef _WIN32
fs::path self = path_;
#else
fs::path self = name_;
#endif
if (auto parent = parent_.lock()) { if (auto parent = parent_.lock()) {
return parent->fs_path() / u8name(); return parent->fs_path() / self;
} }
return fs::path(u8name()); return self;
} }
std::string entry::path_as_string() const { std::string entry::path_as_string() const {

View File

@ -31,6 +31,7 @@
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <system_error> #include <system_error>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
@ -329,6 +330,7 @@ class scanner_ final : public scanner::impl {
os_access const& os_; os_access const& os_;
std::vector<std::unique_ptr<entry_filter>> filters_; std::vector<std::unique_ptr<entry_filter>> filters_;
std::vector<std::unique_ptr<entry_transformer>> transformers_; std::vector<std::unique_ptr<entry_transformer>> transformers_;
std::unordered_set<std::string> invalid_filenames_;
}; };
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -361,6 +363,27 @@ scanner_<LoggerPolicy>::add_entry(std::filesystem::path const& name,
file_scanner& fs, bool debug_filter) { file_scanner& fs, bool debug_filter) {
try { try {
auto pe = entry_factory_.create(os_, name, parent); auto pe = entry_factory_.create(os_, name, parent);
if constexpr (!std::is_same_v<std::filesystem::path::value_type, char>) {
try {
auto tmp [[maybe_unused]] = name.filename().u8string();
} catch (std::system_error const& e) {
LOG_ERROR << fmt::format(
"invalid file name in \"{}\", storing as \"{}\": {}",
path_to_utf8_string_sanitized(name.parent_path()), pe->name(),
e.what());
prog.errors++;
if (!invalid_filenames_.emplace(path_to_utf8_string_sanitized(name))
.second) {
LOG_ERROR << fmt::format(
"cannot store \"{}\" as the name already exists", pe->name());
return nullptr;
}
}
}
bool const exclude = bool const exclude =
std::any_of(filters_.begin(), filters_.end(), [&pe](auto const& f) { std::any_of(filters_.begin(), filters_.end(), [&pe](auto const& f) {
return f->filter(*pe) == filter_action::remove; return f->filter(*pe) == filter_action::remove;
@ -451,7 +474,8 @@ scanner_<LoggerPolicy>::add_entry(std::filesystem::path const& name,
return pe; return pe;
} catch (const std::system_error& e) { } catch (const std::system_error& e) {
LOG_ERROR << fmt::format("error reading entry (path={}): {}", name.string(), LOG_ERROR << fmt::format("error reading entry (path={}): {}",
path_to_utf8_string_sanitized(name),
exception_str(e)); exception_str(e));
prog.errors++; prog.errors++;
} }