feat: use glob-to-regex to simplify and enhance rule_based_entry_filter

2025-09-09 12:28:13 -04:00 · 2024-11-16 17:37:08 +01:00 · 2024-11-16 17:37:08 +01:00 · c427808bb1
commit c427808bb1
parent 5d19513829
4 changed files with 121 additions and 75 deletions
--- a/doc/mkdwarfs.md
+++ b/doc/mkdwarfs.md
@ -693,10 +693,11 @@ Patterns support `?` and `*` wildcards matching a single character
 and any number of characters, respectively. These patterns don't match
 across directory separators (`/`).
-Patterns also support the `**` wildcard, which matches across directory
+Patterns also support the `**` globstar wildcard, which matches across
-separators.
+directory separators.
-Patterns also support character classes.
+Patterns also support character classes (`[avt]`), ranges (`[a-h]`),
 and complementation (`[!a-h]`).
 Here's an exemplary rule set:
--- a/src/writer/rule_based_entry_filter.cpp
+++ b/src/writer/rule_based_entry_filter.cpp
@ -31,6 +31,8 @@
 #include <dwarfs/writer/entry_interface.h>
 #include <dwarfs/writer/rule_based_entry_filter.h>
 #include <dwarfs/internal/glob_to_regex.h>
 namespace dwarfs::writer {
 namespace internal {
@ -51,12 +53,22 @@ struct filter_rule {
  };
  filter_rule(rule_type type, bool floating, std::string const& re,
-              std::string const& rule)
+              std::string_view rule, bool ignore_case = false)
      : type{type}
      , floating{floating}
-      , re{re}
+      , re{re, regex_flags(ignore_case)}
      , rule{rule} {}
  static constexpr std::regex_constants::syntax_option_type
  regex_flags(bool ignore_case) {
    auto flags =
        std::regex_constants::ECMAScript | std::regex_constants::optimize;
    if (ignore_case) {
      flags |= std::regex_constants::icase;
    }
    return flags;
  }
  rule_type type;
  bool floating;
  std::regex re;
@ -89,14 +101,34 @@ class rule_based_entry_filter_ : public rule_based_entry_filter::impl {
 template <typename LoggerPolicy>
 auto rule_based_entry_filter_<LoggerPolicy>::compile_filter_rule(
-    std::string_view rule_sv) -> filter_rule {
+    std::string_view rule) -> filter_rule {
  std::string rule{rule_sv};
  std::string re;
  filter_rule::rule_type type;
-  auto* p = rule.c_str();
+  if (rule.empty()) {
    throw std::runtime_error("empty filter rule");
  }
-  switch (*p) {
+  auto splitpos = rule.find_first_of(' ');
  if (splitpos == std::string::npos) {
    throw std::runtime_error("invalid filter rule: " + std::string(rule));
  }
  auto pattern_start = rule.find_first_not_of(' ', splitpos);
  if (pattern_start == std::string::npos) {
    throw std::runtime_error("no pattern in filter rule: " + std::string(rule));
  }
  auto prefix = rule.substr(0, splitpos);
  auto pattern = rule.substr(pattern_start);
  if (prefix.empty()) {
    throw std::runtime_error("no prefix in filter rule: " + std::string(rule));
  }
  switch (prefix[0]) {
  case '+':
    type = filter_rule::rule_type::include;
    break;
@ -107,76 +139,34 @@ auto rule_based_entry_filter_<LoggerPolicy>::compile_filter_rule(
    throw std::runtime_error("rules must start with + or -");
  }
-  while (*++p == ' ')
+  bool ignore_case{false};
-    ;
+
  prefix.remove_prefix(1);
  for (auto opt : prefix) {
    switch (opt) {
    case 'i':
      ignore_case = true;
      break;
    default:
      throw std::runtime_error(
          fmt::format("unknown option '{}' in filter rule: {}", opt, rule));
    }
  }
  // If the start of the pattern is not explicitly anchored, make it floating.
-  bool floating = *p && *p != '/';
+  bool floating = pattern[0] != '/';
  if (floating) {
    re += ".*/";
  }
-  while (*p) {
+  re += dwarfs::internal::glob_to_regex_string(pattern) + "$";
    switch (*p) {
    case '\\':
      re += *p++;
      if (p) {
        re += *p++;
      }
      continue;
    case '*': {
      int nstar = 1;
      while (*++p == '*') {
        ++nstar;
      }
      switch (nstar) {
      case 1:
        if (re.ends_with('/') and (*p == '/' or *p == '\0')) {
          re += "[^/]+";
        } else {
          re += "[^/]*";
        }
        break;
      case 2:
        re += ".*";
        break;
      default:
        throw std::runtime_error("too many *s");
      }
    }
      continue;
    case '?':
      re += "[^/]";
      break;
    case '.':
    case '+':
    case '^':
    case '$':
    case '(':
    case ')':
    case '{':
    case '}':
    case '|':
      re += '\\';
      re += *p;
      break;
    default:
      re += *p;
      break;
    }
    ++p;
  }
  LOG_DEBUG << "'" << rule << "' -> '" << re << "' [floating=" << floating
-            << "]";
+            << ", ignore_case=" << ignore_case << "]";
-  return filter_rule(type, floating, re, rule);
+  return filter_rule(type, floating, re, rule, ignore_case);
 }
 template <typename LoggerPolicy>
--- a/test/filter_test_data.cpp
+++ b/test/filter_test_data.cpp
@ -79,6 +79,30 @@ R"(
  "usr/lib/Mcrt1.o",
  "usr/lib64",
  "usr/lib64/gcrt1.o",
 }},
 {
  "NoIgnoreCase",
 R"(
 + mc*
 + *led*
 - *
 )", {
  "",
 }},
 {
  "IgnoreCase",
 R"(
 +i mc*
 +i *led*
 - *
 )", {
  "",
  "usr",
  "usr/lib",
  "usr/lib/Mcrt1.o",
  "usr/lib64",
  "usr/lib64/xtables",
  "usr/lib64/xtables/libxt_LED.so",
 }},
    // clang-format on
 };
--- a/test/tool_main_test.cpp
+++ b/test/tool_main_test.cpp
@ -1370,6 +1370,43 @@ TEST(mkdwarfs_test, cannot_combine_input_list_and_filter) {
              ::testing::HasSubstr("cannot combine --input-list and --filter"));
 }
 TEST(mkdwarfs_test, rules_must_start_with_plus_or_minus) {
  auto t = mkdwarfs_tester::create_empty();
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", "% *"}));
  EXPECT_THAT(t.err(), ::testing::HasSubstr("rules must start with + or -"));
 }
 TEST(mkdwarfs_test, empty_filter_rule) {
  auto t = mkdwarfs_tester::create_empty();
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", ""}));
  EXPECT_THAT(t.err(), ::testing::HasSubstr("empty filter rule"));
 }
 TEST(mkdwarfs_test, invalid_filter_rule) {
  auto t = mkdwarfs_tester::create_empty();
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", "+i"}));
  EXPECT_THAT(t.err(), ::testing::HasSubstr("invalid filter rule"));
 }
 TEST(mkdwarfs_test, no_pattern_in_filter_rule) {
  auto t = mkdwarfs_tester::create_empty();
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", "+  "}));
  EXPECT_THAT(t.err(), ::testing::HasSubstr("no pattern in filter rule"));
 }
 TEST(mkdwarfs_test, no_prefix_in_filter_rule) {
  auto t = mkdwarfs_tester::create_empty();
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", " foo"}));
  EXPECT_THAT(t.err(), ::testing::HasSubstr("no prefix in filter rule"));
 }
 TEST(mkdwarfs_test, unknown_option_in_filter_rule) {
  auto t = mkdwarfs_tester::create_empty();
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", "+x foo"}));
  EXPECT_THAT(t.err(),
              ::testing::HasSubstr("unknown option 'x' in filter rule"));
 }
 TEST(mkdwarfs_test, cannot_open_input_list_file) {
  mkdwarfs_tester t;
  EXPECT_NE(0, t.run({"--input-list", "missing.list", "-o", "-"}));
@ -1639,12 +1676,6 @@ TEST(mkdwarfs_test, invalid_progress_mode) {
  EXPECT_THAT(t.err(), ::testing::HasSubstr("invalid progress mode"));
 }
 TEST(mkdwarfs_test, invalid_filter_rule) {
  mkdwarfs_tester t;
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "-F", "grmpf"}));
  EXPECT_THAT(t.err(), ::testing::HasSubstr("could not parse filter rule"));
 }
 TEST(mkdwarfs_test, time_resolution_zero) {
  mkdwarfs_tester t;
  EXPECT_NE(0, t.run({"-i", "/", "-o", "-", "--time-resolution=0"}));