diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go
index eaa4d7f..a0224d4 100644
--- a/cmd/robots2policy/main.go
+++ b/cmd/robots2policy/main.go
@@ -29,7 +29,7 @@ var (
 )

 type RobotsRule struct {
-	UserAgent   string
+	UserAgents  []string
 	Disallows   []string
 	Allows      []string
 	CrawlDelay  int
@@ -133,7 +133,10 @@ func main() {
 func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 	scanner := bufio.NewScanner(input)
 	var rules []RobotsRule
-	var currentRule *RobotsRule
+	var currentUserAgents []string
+	var currentDisallows []string
+	var currentAllows []string
+	var currentCrawlDelay int

 	for scanner.Scan() {
 		line := strings.TrimSpace(scanner.Text())
@@ -154,38 +157,58 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {

 		switch directive {
 		case "user-agent":
-			// Start a new rule section
-			if currentRule != nil {
-				rules = append(rules, *currentRule)
-			}
-			currentRule = &RobotsRule{
-				UserAgent: value,
-				Disallows: make([]string, 0),
-				Allows:    make([]string, 0),
+			// If we have accumulated rules with directives and encounter a new user-agent,
+			// flush the current rules
+			if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
+				rule := RobotsRule{
+					UserAgents: make([]string, len(currentUserAgents)),
+					Disallows:  make([]string, len(currentDisallows)),
+					Allows:     make([]string, len(currentAllows)),
+					CrawlDelay: currentCrawlDelay,
+				}
+				copy(rule.UserAgents, currentUserAgents)
+				copy(rule.Disallows, currentDisallows)
+				copy(rule.Allows, currentAllows)
+				rules = append(rules, rule)
+				// Reset for next group
+				currentUserAgents = nil
+				currentDisallows = nil
+				currentAllows = nil
+				currentCrawlDelay = 0
 			}
+			currentUserAgents = append(currentUserAgents, value)

 		case "disallow":
-			if currentRule != nil && value != "" {
-				currentRule.Disallows = append(currentRule.Disallows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentDisallows = append(currentDisallows, value)
 			}

 		case "allow":
-			if currentRule != nil && value != "" {
-				currentRule.Allows = append(currentRule.Allows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentAllows = append(currentAllows, value)
 			}

 		case "crawl-delay":
-			if currentRule != nil {
+			if len(currentUserAgents) > 0 {
 				if delay, err := parseIntSafe(value); err == nil {
-					currentRule.CrawlDelay = delay
+					currentCrawlDelay = delay
 				}
 			}
 		}
 	}

-	// Don't forget the last rule
-	if currentRule != nil {
-		rules = append(rules, *currentRule)
+	// Don't forget the last group of rules
+	if len(currentUserAgents) > 0 {
+		rule := RobotsRule{
+			UserAgents: make([]string, len(currentUserAgents)),
+			Disallows:  make([]string, len(currentDisallows)),
+			Allows:     make([]string, len(currentAllows)),
+			CrawlDelay: currentCrawlDelay,
+		}
+		copy(rule.UserAgents, currentUserAgents)
+		copy(rule.Disallows, currentDisallows)
+		copy(rule.Allows, currentAllows)
+		rules = append(rules, rule)
 	}

 	// Mark blacklisted user agents (those with "Disallow: /")
@@ -211,10 +234,11 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 	var anubisRules []AnubisRule
 	ruleCounter := 0

+	// Process each robots rule individually
 	for _, robotsRule := range robotsRules {
-		userAgent := robotsRule.UserAgent
+		userAgents := robotsRule.UserAgents

-		// Handle crawl delay as weight adjustment (do this first before any continues)
+		// Handle crawl delay
 		if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
 			ruleCounter++
 			rule := AnubisRule{
@@ -223,20 +247,32 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				Weight: &config.Weight{Adjust: *crawlDelay},
 			}

-			if userAgent == "*" {
+			if len(userAgents) == 1 && userAgents[0] == "*" {
 				rule.Expression = &config.ExpressionOrList{
 					All: []string{"true"}, // Always applies
 				}
-			} else {
+			} else if len(userAgents) == 1 {
 				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
+				}
+			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
+				rule.Expression = &config.ExpressionOrList{
+					Any: expressions,
 				}
 			}
-
 			anubisRules = append(anubisRules, rule)
 		}

-		// Handle blacklisted user agents (complete deny/challenge)
+		// Handle blacklisted user agents
 		if robotsRule.IsBlacklist {
 			ruleCounter++
 			rule := AnubisRule{
@@ -244,21 +280,36 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				Action: *userAgentDeny,
 			}
-			if userAgent == "*" {
-				// This would block everything - convert to a weight adjustment instead
-				rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
-				rule.Action = "WEIGH"
-				rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{"true"}, // Always applies
+			if len(userAgents) == 1 {
+				userAgent := userAgents[0]
+				if userAgent == "*" {
+					// This would block everything - convert to a weight adjustment instead
+					rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
+					rule.Action = "WEIGH"
+					rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{"true"}, // Always applies
+					}
+				} else {
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					}
 				}
 			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
 				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					Any: expressions,
 				}
 			}

 			anubisRules = append(anubisRules, rule)
-			continue
 		}

 		// Handle specific disallow rules
@@ -276,9 +327,33 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 			// Build CEL expression
 			var conditions []string

-			// Add user agent condition if not wildcard
-			if userAgent != "*" {
-				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
+			// Add user agent conditions
+			if len(userAgents) == 1 && userAgents[0] == "*" {
+				// Wildcard user agent - no user agent condition needed
+			} else if len(userAgents) == 1 {
+				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
+			} else {
+				// For multiple user agents, we need to use a more complex expression
+				// This is a limitation - we can't easily combine any for user agents with all for path
+				// So we'll create separate rules for each user agent
+				for _, ua := range userAgents {
+					if ua == "*" {
+						continue // Skip wildcard as it's handled separately
+					}
+					ruleCounter++
+					subRule := AnubisRule{
+						Name:       fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+						Action:     *baseAction,
+						Expression: &config.ExpressionOrList{
+							All: []string{
+								fmt.Sprintf("userAgent.contains(%q)", ua),
+								buildPathCondition(disallow),
+							},
+						},
+					}
+					anubisRules = append(anubisRules, subRule)
+				}
+				continue
 			}

 			// Add path condition
@@ -291,7 +366,6 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {

 			anubisRules = append(anubisRules, rule)
 		}
-
 	}

 	return anubisRules
diff --git a/cmd/robots2policy/robots2policy_test.go b/cmd/robots2policy/robots2policy_test.go
index aa73f6b..e9d90e6 100644
--- a/cmd/robots2policy/robots2policy_test.go
+++ b/cmd/robots2policy/robots2policy_test.go
@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
 			expectedFile: "complex.yaml",
 			options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
 		},
+		{
+			name:         "consecutive_user_agents",
+			robotsFile:   "consecutive.robots.txt",
+			expectedFile: "consecutive.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+		},
 	}

 	for _, tc := range testCases {
diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml
index b22f06f..a3096f5 100644
--- a/cmd/robots2policy/testdata/blacklist.yaml
+++ b/cmd/robots2policy/testdata/blacklist.yaml
@@ -25,6 +25,6 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Googlebot")
-    - path.startsWith("/search")
-  name: robots-txt-policy-disallow-7
\ No newline at end of file
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml
index 2eb0d19..6e677ad 100644
--- a/cmd/robots2policy/testdata/complex.yaml
+++ b/cmd/robots2policy/testdata/complex.yaml
@@ -20,8 +20,8 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Googlebot")
-    - path.startsWith("/search/")
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-6
 - action: WEIGH
   expression: userAgent.contains("Bingbot")
@@ -31,14 +31,14 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Bingbot")
-    - path.startsWith("/search/")
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-8
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("Bingbot")
-    - path.startsWith("/admin/")
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/admin/")
   name: robots-txt-policy-disallow-9
 - action: DENY
   expression: userAgent.contains("BadBot")
@@ -54,18 +54,18 @@
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/.*/admin")
+      - userAgent.contains("TestBot")
+      - path.matches("^/.*/admin")
   name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/temp.*\\.html")
+      - userAgent.contains("TestBot")
+      - path.matches("^/temp.*\\.html")
   name: robots-txt-policy-disallow-14
 - action: CHALLENGE
   expression:
     all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/file.\\.log")
+      - userAgent.contains("TestBot")
+      - path.matches("^/file.\\.log")
   name: robots-txt-policy-disallow-15
diff --git a/cmd/robots2policy/testdata/consecutive.robots.txt b/cmd/robots2policy/testdata/consecutive.robots.txt
new file mode 100644
index 0000000..e4f6cb5
--- /dev/null
+++ b/cmd/robots2policy/testdata/consecutive.robots.txt
@@ -0,0 +1,25 @@
+# Test consecutive user agents that should be grouped into any: blocks
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+# Multiple consecutive user agents - should be grouped
+User-agent: BadBot
+User-agent: SpamBot
+User-agent: EvilBot
+Disallow: /
+
+# Single user agent - should be separate
+User-agent: GoodBot
+Disallow: /private
+
+# Multiple consecutive user agents with crawl delay
+User-agent: SlowBot1
+User-agent: SlowBot2
+Crawl-delay: 5
+
+# Multiple consecutive user agents with specific path
+User-agent: SearchBot1
+User-agent: SearchBot2
+User-agent: SearchBot3
+Disallow: /search
\ No newline at end of file
diff --git a/cmd/robots2policy/testdata/consecutive.yaml b/cmd/robots2policy/testdata/consecutive.yaml
new file mode 100644
index 0000000..144abda
--- /dev/null
+++ b/cmd/robots2policy/testdata/consecutive.yaml
@@ -0,0 +1,47 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SpamBot")
+      - userAgent.contains("EvilBot")
+  name: robots-txt-policy-blacklist-3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("GoodBot")
+      - path.startsWith("/private")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression:
+    any:
+      - userAgent.contains("SlowBot1")
+      - userAgent.contains("SlowBot2")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot1")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot2")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot3")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-9
diff --git a/cmd/robots2policy/testdata/simple.json b/cmd/robots2policy/testdata/simple.json
index 20bdf0d..c8e1de0 100644
--- a/cmd/robots2policy/testdata/simple.json
+++ b/cmd/robots2policy/testdata/simple.json
@@ -1,12 +1,12 @@
 [
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/admin/\")",
-    "name": "robots-txt-policy-disallow-1"
+    "name": "robots-txt-policy-disallow-1",
+    "action": "CHALLENGE"
   },
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/private\")",
-    "name": "robots-txt-policy-disallow-2"
+    "name": "robots-txt-policy-disallow-2",
+    "action": "CHALLENGE"
   }
 ]
\ No newline at end of file
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index 633f9eb..5474051 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - The [Thoth client](https://anubis.techaro.lol/docs/admin/thoth) is now public in the repo instead of being an internal package.
 - [Custom-AsyncHttpClient](https://github.com/AsyncHttpClient/async-http-client)'s default User-Agent has an increased weight by default ([#852](https://github.com/TecharoHQ/anubis/issues/852)).
 - The [`segments`](./admin/configuration/expressions.mdx#segments) function was added for splitting a path into its slash-separated segments.
+- Fixed `robots2policy` to properly group consecutive user agents into `any:` blocks instead of only processing the last one.

 ## v1.21.3: Minfilia Warde - Echo 3
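Illustration (not part of the diff): a minimal Go sketch of the regrouping behavior this change introduces. It assumes it runs inside the `cmd/robots2policy` package (for example from a scratch `_test.go` file), since `parseRobotsTxt` and `RobotsRule` are unexported; the helper name `printGroupedRules` and the sample input are hypothetical.

```go
package main

import (
	"fmt"
	"strings"
)

// printGroupedRules feeds a robots.txt snippet with consecutive
// User-agent lines into parseRobotsTxt and prints how they are grouped.
func printGroupedRules() error {
	input := strings.NewReader("User-agent: BadBot\nUser-agent: SpamBot\nDisallow: /\n")

	rules, err := parseRobotsTxt(input)
	if err != nil {
		return err
	}

	for _, rule := range rules {
		// With the grouping fix this should print a single rule:
		// [BadBot SpamBot] [/]
		fmt.Println(rule.UserAgents, rule.Disallows)
	}
	return nil
}
```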