fix: sanitize UTF-8 strings, plus tests (fixes gh #191)

This commit is contained in:
Marcus Holland-Moritz 2024-02-06 08:59:10 +01:00
parent 6d5f8dcab5
commit 6f738f8f02
5 changed files with 33 additions and 8 deletions

View File

@ -63,6 +63,7 @@ int call_sys_main_iolayer(std::span<std::string> args, iolayer const& iol,
size_t utf8_display_width(char const* p, size_t len); size_t utf8_display_width(char const* p, size_t len);
size_t utf8_display_width(std::string const& str); size_t utf8_display_width(std::string const& str);
void utf8_truncate(std::string& str, size_t len); void utf8_truncate(std::string& str, size_t len);
void utf8_sanitize(std::string& str);
void shorten_path_string(std::string& path, char separator, size_t max_len); void shorten_path_string(std::string& path, char separator, size_t max_len);

View File

@ -91,9 +91,15 @@ void output_context_line(terminal const& term, std::ostream& os,
assert(width >= progress_w + speed_w + 1); assert(width >= progress_w + speed_w + 1);
std::string path;
if (st.path) {
path = *st.path;
utf8_sanitize(path);
}
size_t status_w = width - (progress_w + speed_w + 1); size_t status_w = width - (progress_w + speed_w + 1);
auto path_len = st.path ? utf8_display_width(*st.path) : 0; auto path_len = !path.empty() ? utf8_display_width(path) : 0;
size_t extra_len = st.path && !st.status_string.empty() ? 2 : 0; size_t extra_len = !path.empty() && !st.status_string.empty() ? 2 : 0;
if (status_w < if (status_w <
st.context.size() + st.status_string.size() + path_len + extra_len) { st.context.size() + st.status_string.size() + path_len + extra_len) {
@ -106,11 +112,10 @@ void output_context_line(terminal const& term, std::ostream& os,
if (max_path_len > 0) { if (max_path_len > 0) {
shorten_path_string( shorten_path_string(
*st.path, path, static_cast<char>(std::filesystem::path::preferred_separator),
static_cast<char>(std::filesystem::path::preferred_separator),
max_path_len); max_path_len);
path_len = utf8_display_width(*st.path); path_len = utf8_display_width(path);
} }
} }
@ -128,7 +133,7 @@ void output_context_line(terminal const& term, std::ostream& os,
if (!st.status_string.empty()) { if (!st.status_string.empty()) {
st.status_string += ": "; st.status_string += ": ";
} }
st.status_string += *st.path; st.status_string += path;
} }
std::string progress; std::string progress;

View File

@ -265,6 +265,7 @@ std::string status_string(progress const& p, size_t width) {
label = "writing: "; label = "writing: ";
path = i->any()->path_as_string(); path = i->any()->path_as_string();
} }
utf8_sanitize(path);
shorten_path_string( shorten_path_string(
path, static_cast<char>(std::filesystem::path::preferred_separator), path, static_cast<char>(std::filesystem::path::preferred_separator),
width - label.size()); width - label.size());

View File

@ -265,6 +265,12 @@ void utf8_truncate(std::string& str, size_t len) {
str.resize(p - str.data()); str.resize(p - str.data());
} }
// Ensures `str` passes utf8::is_valid(). A string that is already valid is
// left untouched (the expected common case); otherwise it is overwritten
// with the result of utf8::replace_invalid().
void utf8_sanitize(std::string& str) {
  if (utf8::is_valid(str)) [[likely]] {
    return;
  }

  str = utf8::replace_invalid(str);
}
void shorten_path_string(std::string& path, char separator, size_t max_len) { void shorten_path_string(std::string& path, char separator, size_t max_len) {
auto len = utf8_display_width(path); auto len = utf8_display_width(path);

View File

@ -154,6 +154,7 @@ struct random_file_tree_options {
int dimension{20}; int dimension{20};
int max_name_len{50}; int max_name_len{50};
bool with_errors{false}; bool with_errors{false};
bool with_invalid_utf8{false};
}; };
class mkdwarfs_tester : public tester_common { class mkdwarfs_tester : public tester_common {
@ -193,10 +194,14 @@ class mkdwarfs_tester : public tester_common {
std::mt19937_64 rng{42}; std::mt19937_64 rng{42};
std::exponential_distribution<> size_dist{1 / opt.avg_size}; std::exponential_distribution<> size_dist{1 / opt.avg_size};
std::uniform_int_distribution<> path_comp_size_dist{0, opt.max_name_len}; std::uniform_int_distribution<> path_comp_size_dist{0, opt.max_name_len};
std::uniform_int_distribution<> invalid_dist{0, 1};
std::vector<std::pair<fs::path, std::string>> paths; std::vector<std::pair<fs::path, std::string>> paths;
auto random_path_component = [&] { auto random_path_component = [&] {
auto size = path_comp_size_dist(rng); auto size = path_comp_size_dist(rng);
if (opt.with_invalid_utf8 && invalid_dist(rng) == 0) {
return test::create_random_string(size, 96, 255, rng);
}
return test::create_random_string(size, 'A', 'Z', rng); return test::create_random_string(size, 'A', 'Z', rng);
}; };
@ -213,7 +218,7 @@ class mkdwarfs_tester : public tester_common {
auto size = std::min(max_size, static_cast<size_t>(size_dist(rng))); auto size = std::min(max_size, static_cast<size_t>(size_dist(rng)));
std::string data; std::string data;
if (rng() % 2 == 0) { if (size < 1024 * 1024 && rng() % 2 == 0) {
data = test::create_random_string(size, rng); data = test::create_random_string(size, rng);
} else { } else {
data = test::loremipsum(size); data = test::loremipsum(size);
@ -1734,7 +1739,14 @@ TEST_P(mkdwarfs_progress_test, basic) {
t.iol->set_terminal_fancy(true); t.iol->set_terminal_fancy(true);
t.add_root_dir(); t.add_root_dir();
t.add_random_file_tree(); t.add_random_file_tree({
.avg_size = 20.0 * 1024 * 1024,
.dimension = 2,
#ifndef _WIN32
// Windows can't deal with non-UTF-8 filenames
.with_invalid_utf8 = true,
#endif
});
t.os->add_local_files(audio_data_dir); t.os->add_local_files(audio_data_dir);
t.os->add_local_files(fits_data_dir); t.os->add_local_files(fits_data_dir);