From e51b4bd9657349722b36944e7ec0088c2243c7bc Mon Sep 17 00:00:00 2001
From: Jason Cameron
Date: Sat, 26 Jul 2025 20:34:04 -0400
Subject: [PATCH] fix: collate

---
 cmd/robots2policy/main.go                   | 321 ++++++++----------
 cmd/robots2policy/robots2policy_test.go     |   6 +
 cmd/robots2policy/testdata/blacklist.yaml   |  30 +-
 cmd/robots2policy/testdata/complex.yaml     |  93 ++---
 .../testdata/consecutive.robots.txt         |  25 ++
 cmd/robots2policy/testdata/consecutive.yaml |  47 +++
 cmd/robots2policy/testdata/wildcards.yaml   |   6 +-
 7 files changed, 280 insertions(+), 248 deletions(-)
 create mode 100644 cmd/robots2policy/testdata/consecutive.robots.txt
 create mode 100644 cmd/robots2policy/testdata/consecutive.yaml

diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go
index 8928e3c..9bd2c9c 100644
--- a/cmd/robots2policy/main.go
+++ b/cmd/robots2policy/main.go
@@ -10,7 +10,6 @@ import (
 	"net/http"
 	"os"
 	"regexp"
-	"sort"
 	"strings"
 
 	"github.com/TecharoHQ/anubis/lib/policy/config"
@@ -30,7 +29,7 @@ var (
 )
 
 type RobotsRule struct {
-	UserAgent  string
+	UserAgents []string
 	Disallows  []string
 	Allows     []string
 	CrawlDelay int
@@ -161,17 +160,16 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 			// If we have accumulated rules with directives and encounter a new user-agent,
 			// flush the current rules
 			if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
-				for _, userAgent := range currentUserAgents {
-					rule := RobotsRule{
-						UserAgent:  userAgent,
-						Disallows:  make([]string, len(currentDisallows)),
-						Allows:     make([]string, len(currentAllows)),
-						CrawlDelay: currentCrawlDelay,
-					}
-					copy(rule.Disallows, currentDisallows)
-					copy(rule.Allows, currentAllows)
-					rules = append(rules, rule)
+				rule := RobotsRule{
+					UserAgents: make([]string, len(currentUserAgents)),
+					Disallows:  make([]string, len(currentDisallows)),
+					Allows:     make([]string, len(currentAllows)),
+					CrawlDelay: currentCrawlDelay,
 				}
+				copy(rule.UserAgents, currentUserAgents)
+				copy(rule.Disallows, currentDisallows)
+				copy(rule.Allows, currentAllows)
+				rules = append(rules, rule)
 				// Reset for next group
 				currentUserAgents = nil
 				currentDisallows = nil
@@ -201,17 +199,16 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 
 	// Don't forget the last group of rules
 	if len(currentUserAgents) > 0 {
-		for _, userAgent := range currentUserAgents {
-			rule := RobotsRule{
-				UserAgent:  userAgent,
-				Disallows:  make([]string, len(currentDisallows)),
-				Allows:     make([]string, len(currentAllows)),
-				CrawlDelay: currentCrawlDelay,
-			}
-			copy(rule.Disallows, currentDisallows)
-			copy(rule.Allows, currentAllows)
-			rules = append(rules, rule)
+		rule := RobotsRule{
+			UserAgents: make([]string, len(currentUserAgents)),
+			Disallows:  make([]string, len(currentDisallows)),
+			Allows:     make([]string, len(currentAllows)),
+			CrawlDelay: currentCrawlDelay,
 		}
+		copy(rule.UserAgents, currentUserAgents)
+		copy(rule.Disallows, currentDisallows)
+		copy(rule.Allows, currentAllows)
+		rules = append(rules, rule)
 	}
 
 	// Mark blacklisted user agents (those with "Disallow: /")
@@ -237,24 +234,82 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 	var anubisRules []AnubisRule
 	ruleCounter := 0
 
-	// Group rules by their directives to create any blocks
-	blacklistGroups := make(map[string][]string)  // key: directive signature, value: user agents
-	disallowGroups := make(map[string][]string)   // key: path, value: user agents
-	crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents
-
+	// Process each robots rule individually
 	for _, robotsRule := range robotsRules {
-		userAgent := robotsRule.UserAgent
+		userAgents := robotsRule.UserAgents
 
-		// Handle crawl delay groups
+		// Handle crawl delay
 		if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
-			key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay)
-			crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent)
+			ruleCounter++
+			rule := AnubisRule{
+				Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
+				Action: "WEIGH",
+				Weight: &config.Weight{Adjust: *crawlDelay},
+			}
+
+			if len(userAgents) == 1 && userAgents[0] == "*" {
+				rule.Expression = &config.ExpressionOrList{
+					All: []string{"true"}, // Always applies
+				}
+			} else if len(userAgents) == 1 {
+				rule.Expression = &config.ExpressionOrList{
+					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
+				}
+			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
+				rule.Expression = &config.ExpressionOrList{
+					Any: expressions,
+				}
+			}
+			anubisRules = append(anubisRules, rule)
 		}
 
 		// Handle blacklisted user agents
 		if robotsRule.IsBlacklist {
-			key := "blacklist"
-			blacklistGroups[key] = append(blacklistGroups[key], userAgent)
+			ruleCounter++
+			rule := AnubisRule{
+				Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
+				Action: *userAgentDeny,
+			}
+
+			if len(userAgents) == 1 {
+				userAgent := userAgents[0]
+				if userAgent == "*" {
+					// This would block everything - convert to a weight adjustment instead
+					rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
+					rule.Action = "WEIGH"
+					rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{"true"}, // Always applies
+					}
+				} else {
+					rule.Expression = &config.ExpressionOrList{
+						All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+					}
+				}
+			} else {
+				// Multiple user agents - use any block
+				var expressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						expressions = append(expressions, "true")
+					} else {
+						expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
+				}
+				rule.Expression = &config.ExpressionOrList{
+					Any: expressions,
+				}
+			}
+			anubisRules = append(anubisRules, rule)
 		}
 
 		// Handle specific disallow rules
@@ -262,168 +317,64 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 			if disallow == "/" {
 				continue // Already handled as blacklist above
 			}
-			disallowGroups[disallow] = append(disallowGroups[disallow], userAgent)
-		}
-	}
-
-	// Generate rules for crawl delays
-	// Sort keys for deterministic order
-	var crawlDelayKeys []string
-	for key := range crawlDelayGroups {
-		crawlDelayKeys = append(crawlDelayKeys, key)
-	}
-	sort.Strings(crawlDelayKeys)
-
-	for _, key := range crawlDelayKeys {
-		userAgents := crawlDelayGroups[key]
-		ruleCounter++
-		rule := AnubisRule{
-			Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
-			Action: "WEIGH",
-			Weight: &config.Weight{Adjust: *crawlDelay},
-		}
-
-		if len(userAgents) == 1 && userAgents[0] == "*" {
-			rule.Expression = &config.ExpressionOrList{
-				All: []string{"true"}, // Always applies
+			ruleCounter++
+			rule := AnubisRule{
+				Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+				Action: *baseAction,
 			}
-		} else if len(userAgents) == 1 {
-			rule.Expression = &config.ExpressionOrList{
-				All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
-			}
-		} else {
-			// Multiple user agents - use any block
-			var expressions []string
-			for _, ua := range userAgents {
-				if ua == "*" {
-					expressions = append(expressions, "true")
-				} else {
-					expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
-				}
-			}
-			rule.Expression = &config.ExpressionOrList{
-				Any: expressions,
-			}
-		}
-		anubisRules = append(anubisRules, rule)
-	}
+			// Build CEL expression
+			var conditions []string
 
-	// Generate rules for blacklisted user agents
-	// Sort keys for deterministic order
-	var blacklistKeys []string
-	for key := range blacklistGroups {
-		blacklistKeys = append(blacklistKeys, key)
-	}
-	sort.Strings(blacklistKeys)
-
-	for _, key := range blacklistKeys {
-		userAgents := blacklistGroups[key]
-		ruleCounter++
-		rule := AnubisRule{
-			Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
-			Action: *userAgentDeny,
-		}
-
-		if len(userAgents) == 1 {
-			userAgent := userAgents[0]
-			if userAgent == "*" {
-				// This would block everything - convert to a weight adjustment instead
-				rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
-				rule.Action = "WEIGH"
-				rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{"true"}, // Always applies
-				}
+			// Add user agent conditions
+			if len(userAgents) == 1 && userAgents[0] == "*" {
+				// Wildcard user agent - no user agent condition needed
+			} else if len(userAgents) == 1 {
+				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
 			} else {
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+				// Multiple user agents - use any block for user agents
+				var uaExpressions []string
+				for _, ua := range userAgents {
+					if ua == "*" {
+						uaExpressions = append(uaExpressions, "true")
+					} else {
+						uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+					}
 				}
-			}
-		} else {
-			// Multiple user agents - use any block
-			var expressions []string
-			for _, ua := range userAgents {
-				if ua == "*" {
-					expressions = append(expressions, "true")
-				} else {
-					expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
-				}
-			}
-			rule.Expression = &config.ExpressionOrList{
-				Any: expressions,
-			}
-		}
-
-		anubisRules = append(anubisRules, rule)
-	}
-
-	// Generate rules for specific disallow paths
-	// Sort keys for deterministic order
-	var disallowKeys []string
-	for key := range disallowGroups {
-		disallowKeys = append(disallowKeys, key)
-	}
-	sort.Strings(disallowKeys)
-
-	for _, path := range disallowKeys {
-		userAgents := disallowGroups[path]
-		ruleCounter++
-		rule := AnubisRule{
-			Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
-			Action: *baseAction,
-		}
-
-		// Build CEL expression
-		var conditions []string
-
-		// Add user agent conditions
-		if len(userAgents) == 1 && userAgents[0] == "*" {
-			// Wildcard user agent - no user agent condition needed
-		} else if len(userAgents) == 1 {
-			conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
-		} else {
-			// Multiple user agents - use any block for user agents
-			var uaExpressions []string
-			for _, ua := range userAgents {
-				if ua == "*" {
-					uaExpressions = append(uaExpressions, "true")
-				} else {
-					uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua))
-				}
-			}
-			// For multiple user agents, we need to use a more complex expression
-			// This is a limitation - we can't easily combine any for user agents with all for path
-			// So we'll create separate rules for each user agent
-			for _, ua := range userAgents {
-				if ua == "*" {
-					continue // Skip wildcard as it's handled separately
-				}
-				ruleCounter++
-				subRule := AnubisRule{
-					Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
-					Action: *baseAction,
-					Expression: &config.ExpressionOrList{
-						All: []string{
-							fmt.Sprintf("userAgent.contains(%q)", ua),
-							buildPathCondition(path),
+				// For multiple user agents, we need to use a more complex expression
+				// This is a limitation - we can't easily combine any for user agents with all for path
+				// So we'll create separate rules for each user agent
+				for _, ua := range userAgents {
+					if ua == "*" {
+						continue // Skip wildcard as it's handled separately
+					}
+					ruleCounter++
+					subRule := AnubisRule{
+						Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+						Action: *baseAction,
+						Expression: &config.ExpressionOrList{
+							All: []string{
+								fmt.Sprintf("userAgent.contains(%q)", ua),
+								buildPathCondition(disallow),
+							},
 						},
-					},
+					}
+					anubisRules = append(anubisRules, subRule)
 				}
-				anubisRules = append(anubisRules, subRule)
+				continue
 			}
-			continue
+
+			// Add path condition
+			pathCondition := buildPathCondition(disallow)
+			conditions = append(conditions, pathCondition)
+
+			rule.Expression = &config.ExpressionOrList{
+				All: conditions,
+			}
+
+			anubisRules = append(anubisRules, rule)
 		}
-
-		// Add path condition
-		pathCondition := buildPathCondition(path)
-		conditions = append(conditions, pathCondition)
-
-		rule.Expression = &config.ExpressionOrList{
-			All: conditions,
-		}
-
-		anubisRules = append(anubisRules, rule)
 	}
 
 	return anubisRules
diff --git a/cmd/robots2policy/robots2policy_test.go b/cmd/robots2policy/robots2policy_test.go
index aa73f6b..e9d90e6 100644
--- a/cmd/robots2policy/robots2policy_test.go
+++ b/cmd/robots2policy/robots2policy_test.go
@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
 			expectedFile: "complex.yaml",
 			options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
 		},
+		{
+			name:         "consecutive_user_agents",
+			robotsFile:   "consecutive.robots.txt",
+			expectedFile: "consecutive.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+		},
 	}
 
 	for _, tc := range testCases {
diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml
index b0a93d1..a3096f5 100644
--- a/cmd/robots2policy/testdata/blacklist.yaml
+++ b/cmd/robots2policy/testdata/blacklist.yaml
@@ -3,28 +3,28 @@
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 3
-- action: WEIGH
-  expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-2
-  weight:
-    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-3
 - action: WEIGH
   expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-crawl-delay-3
+  name: robots-txt-policy-crawl-delay-4
   weight:
     adjust: 3
 - action: DENY
-  expression:
-    any:
-    - userAgent.contains("BadBot")
-    - userAgent.contains("SpamBot")
-  name: robots-txt-policy-blacklist-4
-- action: CHALLENGE
-  expression: path.startsWith("/admin")
-  name: robots-txt-policy-disallow-5
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-blacklist-5
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-6
+  weight:
+    adjust: 3
 - action: CHALLENGE
   expression:
     all:
     - userAgent.contains("Googlebot")
     - path.startsWith("/search")
-  name: robots-txt-policy-disallow-6
+  name: robots-txt-policy-disallow-7
diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml
index 1f13be3..6e677ad 100644
--- a/cmd/robots2policy/testdata/complex.yaml
+++ b/cmd/robots2policy/testdata/complex.yaml
@@ -1,68 +1,71 @@
 - action: WEIGH
-  expression: userAgent.contains("Bingbot")
+  expression: "true"
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 5
-- action: WEIGH
-  expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-2
-  weight:
-    adjust: 5
-- action: WEIGH
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-crawl-delay-3
-  weight:
-    adjust: 5
-- action: WEIGH
-  expression: "true"
-  name: robots-txt-policy-crawl-delay-4
-  weight:
-    adjust: 5
-- action: DENY
-  expression:
-    any:
-    - userAgent.contains("BadBot")
-    - userAgent.contains("SeoBot")
-  name: robots-txt-policy-blacklist-5
 - action: CHALLENGE
-  expression:
-    all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/.*/admin")
-  name: robots-txt-policy-disallow-6
-- action: CHALLENGE
-  expression:
-    all:
-    - userAgent.contains("Bingbot")
-    - path.startsWith("/admin/")
-  name: robots-txt-policy-disallow-8
-- action: CHALLENGE
-  expression: path.startsWith("/api/internal/")
-  name: robots-txt-policy-disallow-9
-- action: CHALLENGE
-  expression:
-    all:
-    - userAgent.contains("TestBot")
-    - path.matches("^/file.\\.log")
-  name: robots-txt-policy-disallow-10
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.startsWith("/private/")
-  name: robots-txt-policy-disallow-11
+  name: robots-txt-policy-disallow-3
+- action: CHALLENGE
+  expression: path.startsWith("/api/internal/")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 5
 - action: CHALLENGE
   expression:
     all:
     - userAgent.contains("Googlebot")
    - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-13
+  name: robots-txt-policy-disallow-6
+- action: WEIGH
+  expression: userAgent.contains("Bingbot")
+  name: robots-txt-policy-crawl-delay-7
+  weight:
+    adjust: 5
 - action: CHALLENGE
   expression:
     all:
     - userAgent.contains("Bingbot")
     - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-14
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("Bingbot")
+    - path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-9
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-10
+- action: WEIGH
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-crawl-delay-11
+  weight:
+    adjust: 5
+- action: DENY
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-blacklist-12
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("TestBot")
+    - path.matches("^/.*/admin")
+  name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
     - userAgent.contains("TestBot")
     - path.matches("^/temp.*\\.html")
+  name: robots-txt-policy-disallow-14
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("TestBot")
+    - path.matches("^/file.\\.log")
   name: robots-txt-policy-disallow-15
diff --git a/cmd/robots2policy/testdata/consecutive.robots.txt b/cmd/robots2policy/testdata/consecutive.robots.txt
new file mode 100644
index 0000000..e4f6cb5
--- /dev/null
+++ b/cmd/robots2policy/testdata/consecutive.robots.txt
@@ -0,0 +1,25 @@
+# Test consecutive user agents that should be grouped into any: blocks
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+# Multiple consecutive user agents - should be grouped
+User-agent: BadBot
+User-agent: SpamBot
+User-agent: EvilBot
+Disallow: /
+
+# Single user agent - should be separate
+User-agent: GoodBot
+Disallow: /private
+
+# Multiple consecutive user agents with crawl delay
+User-agent: SlowBot1
+User-agent: SlowBot2
+Crawl-delay: 5
+
+# Multiple consecutive user agents with specific path
+User-agent: SearchBot1
+User-agent: SearchBot2
+User-agent: SearchBot3
+Disallow: /search
\ No newline at end of file
diff --git a/cmd/robots2policy/testdata/consecutive.yaml b/cmd/robots2policy/testdata/consecutive.yaml
new file mode 100644
index 0000000..144abda
--- /dev/null
+++ b/cmd/robots2policy/testdata/consecutive.yaml
@@ -0,0 +1,47 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression:
+    any:
+    - userAgent.contains("BadBot")
+    - userAgent.contains("SpamBot")
+    - userAgent.contains("EvilBot")
+  name: robots-txt-policy-blacklist-3
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("GoodBot")
+    - path.startsWith("/private")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression:
+    any:
+    - userAgent.contains("SlowBot1")
+    - userAgent.contains("SlowBot2")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("SearchBot1")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("SearchBot2")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+    - userAgent.contains("SearchBot3")
+    - path.startsWith("/search")
+  name: robots-txt-policy-disallow-9
diff --git a/cmd/robots2policy/testdata/wildcards.yaml b/cmd/robots2policy/testdata/wildcards.yaml
index 85be302..ff51578 100644
--- a/cmd/robots2policy/testdata/wildcards.yaml
+++ b/cmd/robots2policy/testdata/wildcards.yaml
@@ -1,12 +1,12 @@
 - action: CHALLENGE
-  expression: path.matches("^/.*/private")
+  expression: path.matches("^/search.*")
   name: robots-txt-policy-disallow-1
 - action: CHALLENGE
-  expression: path.matches("^/admin/.*.action=delete")
+  expression: path.matches("^/.*/private")
   name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.matches("^/file.\\.txt")
   name: robots-txt-policy-disallow-3
 - action: CHALLENGE
-  expression: path.matches("^/search.*")
+  expression: path.matches("^/admin/.*.action=delete")
   name: robots-txt-policy-disallow-4
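
Note for reviewers (not part of the patch): the collation behaviour this change introduces in parseRobotsTxt and convertToAnubisRules can be pictured with the minimal, self-contained Go sketch below. The group type and collate function are hypothetical illustrations, not code from this repository; they only show how consecutive "User-agent:" lines end up sharing the directives that follow them, mirroring the switch from UserAgent string to UserAgents []string above.

// Minimal sketch, assuming a much-simplified robots.txt grammar
// (only User-agent and Disallow are handled).
package main

import (
	"bufio"
	"fmt"
	"strings"
)

type group struct {
	agents    []string
	disallows []string
}

func collate(robots string) []group {
	var groups []group
	var cur group
	flush := func() {
		if len(cur.agents) > 0 {
			groups = append(groups, cur)
		}
		cur = group{}
	}
	sc := bufio.NewScanner(strings.NewReader(robots))
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		lower := strings.ToLower(line)
		switch {
		case strings.HasPrefix(lower, "user-agent:"):
			// A user-agent that follows accumulated directives starts a
			// new group; consecutive user-agents join the current group.
			if len(cur.disallows) > 0 {
				flush()
			}
			cur.agents = append(cur.agents, strings.TrimSpace(line[len("user-agent:"):]))
		case strings.HasPrefix(lower, "disallow:"):
			cur.disallows = append(cur.disallows, strings.TrimSpace(line[len("disallow:"):]))
		}
	}
	flush()
	return groups
}

func main() {
	in := "User-agent: BadBot\nUser-agent: SpamBot\nUser-agent: EvilBot\nDisallow: /\n\nUser-agent: GoodBot\nDisallow: /private\n"
	for _, g := range collate(in) {
		fmt.Printf("agents=%v disallows=%v\n", g.agents, g.disallows)
	}
}

Run on the BadBot/SpamBot/EvilBot block from consecutive.robots.txt, this prints a single group with three agents, which is what produces the single DENY rule with an any: expression in consecutive.yaml instead of one rule per user agent.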