fix(config): remove trailing newlines in regexes (#373)

Closes #372

Fun YAML fact of the day:

What is the difference between how these two expressions are parsed?

```yaml
foo: >
  bar
```

```yaml
foo: >-
  bar
```

They are invisible in yaml, but when you evaluate them to JSON the
difference is obvious:

```json
{
  "foo": "bar\n"
}
```

```json
{
  "foo": "bar"
}
```

User-Agent strings, URL path values, and HTTP headers _do_ end in
newlines in HTTP/1.1 wire form, but that newline is usually stripped
before the server actually handles it. Also HTTP/2 is a thing and does
not terminate header values with newlines.

This change makes Anubis more aggressively detect mistaken uses of the
yaml `>` operator and nudges the user into using the yaml `>-` operator
which does not append the trailing newline.

I had honestly forgotten about this YAML behavior because it wasn't
relevant for so long. Oops! Glad I released a beta.

Whenever you get into this state, Anubis will throw a config parsing
error and then give you a message hinting at the folly of your ways.

```
config.Bot: regular expression ends with newline (try >- instead of > in yaml)
```

Big thanks to https://yaml-multiline.info, this helped me realize my
folly instantly.

@aiverson, this is official permission to say "told you so".

Signed-off-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
Xe Iaso 2025-04-26 10:01:15 -04:00 committed by GitHub
parent c669b47b57
commit ef52550e70
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 61 additions and 9 deletions

View File

@ -41,7 +41,7 @@
},
{
"name": "generic-browser",
"user_agent_regex": "Mozilla|Opera\n",
"user_agent_regex": "Mozilla|Opera",
"action": "CHALLENGE"
}
],

View File

@ -43,7 +43,7 @@ bots:
# Generic catchall rule
- name: generic-browser
user_agent_regex: >
user_agent_regex: >-
Mozilla|Opera
action: CHALLENGE

View File

@ -1,4 +1,4 @@
- name: "ai-robots-txt"
user_agent_regex: >
user_agent_regex: >-
AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot
action: DENY

View File

@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
- Ensure regexes can't end in newlines ([#372](https://github.com/TecharoHQ/anubis/issues/372))
- Add documentation for default allow behavior (implicit rule)
- Enable [importing configuration snippets](./admin/configuration/import.mdx) ([#321](https://github.com/TecharoHQ/anubis/pull/321))
- Refactor check logic to be more generic and work on a Checker type

View File

@ -110,11 +110,11 @@ func NewUserAgentChecker(rexStr string) (Checker, error) {
}
func NewHeaderMatchesChecker(header, rexStr string) (Checker, error) {
rex, err := regexp.Compile(rexStr)
rex, err := regexp.Compile(strings.TrimSpace(rexStr))
if err != nil {
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
}
return &HeaderMatchesChecker{header, rex, internal.SHA256sum(header + ": " + rexStr)}, nil
return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.SHA256sum(header + ": " + rexStr)}, nil
}
func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) {
@ -135,7 +135,7 @@ type PathChecker struct {
}
func NewPathChecker(rexStr string) (Checker, error) {
rex, err := regexp.Compile(rexStr)
rex, err := regexp.Compile(strings.TrimSpace(rexStr))
if err != nil {
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
}
@ -155,7 +155,7 @@ func (pc *PathChecker) Hash() string {
}
func NewHeaderExistsChecker(key string) Checker {
return headerExistsChecker{key}
return headerExistsChecker{strings.TrimSpace(key)}
}
type headerExistsChecker struct {
@ -180,11 +180,11 @@ func NewHeadersChecker(headermap map[string]string) (Checker, error) {
for key, rexStr := range headermap {
if rexStr == ".*" {
result = append(result, headerExistsChecker{key})
result = append(result, headerExistsChecker{strings.TrimSpace(key)})
continue
}
rex, err := regexp.Compile(rexStr)
rex, err := regexp.Compile(strings.TrimSpace(rexStr))
if err != nil {
errs = append(errs, fmt.Errorf("while compiling header %s regex %s: %w", key, rexStr, err))
continue

View File

@ -24,6 +24,7 @@ var (
ErrInvalidPathRegex = errors.New("config.Bot: invalid path regex")
ErrInvalidHeadersRegex = errors.New("config.Bot: invalid headers regex")
ErrInvalidCIDR = errors.New("config.Bot: invalid CIDR")
ErrRegexEndsWithNewline = errors.New("config.Bot: regular expression ends with newline (try >- instead of > in yaml)")
ErrInvalidImportStatement = errors.New("config.ImportStatement: invalid source file")
ErrCantSetBotAndImportValuesAtOnce = errors.New("config.BotOrImport: can't set bot rules and import values at the same time")
ErrMustSetBotOrImportRules = errors.New("config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both")
@ -91,12 +92,20 @@ func (b BotConfig) Valid() error {
}
if b.UserAgentRegex != nil {
if strings.HasSuffix(*b.UserAgentRegex, "\n") {
errs = append(errs, fmt.Errorf("%w: user agent regex: %q", ErrRegexEndsWithNewline, *b.UserAgentRegex))
}
if _, err := regexp.Compile(*b.UserAgentRegex); err != nil {
errs = append(errs, ErrInvalidUserAgentRegex, err)
}
}
if b.PathRegex != nil {
if strings.HasSuffix(*b.PathRegex, "\n") {
errs = append(errs, fmt.Errorf("%w: path regex: %q", ErrRegexEndsWithNewline, *b.PathRegex))
}
if _, err := regexp.Compile(*b.PathRegex); err != nil {
errs = append(errs, ErrInvalidPathRegex, err)
}
@ -108,6 +117,10 @@ func (b BotConfig) Valid() error {
continue
}
if strings.HasSuffix(expr, "\n") {
errs = append(errs, fmt.Errorf("%w: header %s regex: %q", ErrRegexEndsWithNewline, name, expr))
}
if _, err := regexp.Compile(expr); err != nil {
errs = append(errs, ErrInvalidHeadersRegex, err)
}

View File

@ -0,0 +1,21 @@
{
"bots": [
{
"name": "user-agent-ends-newline",
"user_agent_regex": "Mozilla\n",
"action": "CHALLENGE"
},
{
"name": "path-ends-newline",
"path_regex": "^/evil/.*$\n",
"action": "CHALLENGE"
},
{
"name": "headers-ends-newline",
"headers_regex": {
"CF-Worker": ".*\n"
},
"action": "CHALLENGE"
}
]
}

View File

@ -0,0 +1,17 @@
bots:
- name: user-agent-ends-newline
# Subtle bug: this ends with a newline
user_agent_regex: >
Mozilla
action: CHALLENGE
- name: path-ends-newline
# Subtle bug: this ends with a newline
path_regex: >
^/evil/.*$
action: CHALLENGE
- name: headers-ends-newline
# Subtle bug: this ends with a newline
headers_regex:
CF-Worker: >
.*
action: CHALLENGE