From 291ed2a084cf5b1a2257a0b2f313729b5532c4d1 Mon Sep 17 00:00:00 2001
From: Jason Cameron
Date: Sat, 26 Jul 2025 20:27:00 -0400
Subject: [PATCH] fix: handle multiple user agents
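
robots.txt allows a group to list several User-agent lines before the
directives that apply to them, but the parser previously started a new
rule on every User-agent line, so only the last agent in a group
received the group's directives. The parser now accumulates consecutive
User-agent lines and flushes the whole group once its directive block
ends, and the generator merges agents that share a directive into a
single rule with an `any` expression, sorting group keys first so the
output order is deterministic.

Illustrative example (hypothetical input; the output shape and naming
follow the updated testdata):

    User-agent: BadBot
    User-agent: SpamBot
    Disallow: /

now yields a single DENY rule covering both agents:

    - action: DENY
      expression:
        any:
          - userAgent.contains("BadBot")
          - userAgent.contains("SpamBot")
      name: robots-txt-policy-blacklist-1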
---
 cmd/robots2policy/main.go                 | 276 ++++++++++++++++------
 cmd/robots2policy/testdata/blacklist.yaml |  36 +--
 cmd/robots2policy/testdata/complex.yaml   |  81 +++----
 cmd/robots2policy/testdata/simple.json    |   8 +-
 cmd/robots2policy/testdata/wildcards.yaml |   8 +-
 5 files changed, 269 insertions(+), 140 deletions(-)

diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go
index eaa4d7f..8928e3c 100644
--- a/cmd/robots2policy/main.go
+++ b/cmd/robots2policy/main.go
@@ -10,6 +10,7 @@ import (
 	"net/http"
 	"os"
 	"regexp"
+	"sort"
 	"strings"

 	"github.com/TecharoHQ/anubis/lib/policy/config"
@@ -133,7 +134,10 @@ func main() {
 func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 	scanner := bufio.NewScanner(input)
 	var rules []RobotsRule
-	var currentRule *RobotsRule
+	var currentUserAgents []string
+	var currentDisallows []string
+	var currentAllows []string
+	var currentCrawlDelay int

 	for scanner.Scan() {
 		line := strings.TrimSpace(scanner.Text())
@@ -154,38 +158,60 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {

 		switch directive {
 		case "user-agent":
-			// Start a new rule section
-			if currentRule != nil {
-				rules = append(rules, *currentRule)
-			}
-			currentRule = &RobotsRule{
-				UserAgent: value,
-				Disallows: make([]string, 0),
-				Allows:    make([]string, 0),
+			// A user-agent line that follows a directive block starts a new
+			// group, so flush the accumulated group first.
+			if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
+				for _, userAgent := range currentUserAgents {
+					rule := RobotsRule{
+						UserAgent:  userAgent,
+						Disallows:  make([]string, len(currentDisallows)),
+						Allows:     make([]string, len(currentAllows)),
+						CrawlDelay: currentCrawlDelay,
+					}
+					copy(rule.Disallows, currentDisallows)
+					copy(rule.Allows, currentAllows)
+					rules = append(rules, rule)
+				}
+				// Reset for the next group
+				currentUserAgents = nil
+				currentDisallows = nil
+				currentAllows = nil
+				currentCrawlDelay = 0
 			}
+			currentUserAgents = append(currentUserAgents, value)
 		case "disallow":
-			if currentRule != nil && value != "" {
-				currentRule.Disallows = append(currentRule.Disallows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentDisallows = append(currentDisallows, value)
 			}
 		case "allow":
-			if currentRule != nil && value != "" {
-				currentRule.Allows = append(currentRule.Allows, value)
+			if len(currentUserAgents) > 0 && value != "" {
+				currentAllows = append(currentAllows, value)
 			}
 		case "crawl-delay":
-			if currentRule != nil {
+			if len(currentUserAgents) > 0 {
 				if delay, err := parseIntSafe(value); err == nil {
-					currentRule.CrawlDelay = delay
+					currentCrawlDelay = delay
 				}
 			}
 		}
 	}

-	// Don't forget the last rule
-	if currentRule != nil {
-		rules = append(rules, *currentRule)
+	// Don't forget the last group of rules
+	if len(currentUserAgents) > 0 {
+		for _, userAgent := range currentUserAgents {
+			rule := RobotsRule{
+				UserAgent:  userAgent,
+				Disallows:  make([]string, len(currentDisallows)),
+				Allows:     make([]string, len(currentAllows)),
+				CrawlDelay: currentCrawlDelay,
+			}
+			copy(rule.Disallows, currentDisallows)
+			copy(rule.Allows, currentAllows)
+			rules = append(rules, rule)
+		}
 	}

 	// Mark blacklisted user agents (those with "Disallow: /")
@@ -211,39 +237,96 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 	var anubisRules []AnubisRule
 	ruleCounter := 0

+	// Group rules by their directives so agents sharing a directive can be
+	// emitted as a single rule with an any block
+	blacklistGroups := make(map[string][]string)  // key: directive signature, value: user agents
+	disallowGroups := make(map[string][]string)   // key: path, value: user agents
+	crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents
+
 	for _, robotsRule := range robotsRules {
 		userAgent := robotsRule.UserAgent

-		// Handle crawl delay as weight adjustment (do this first before any continues)
+		// Collect crawl-delay groups
 		if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
-			ruleCounter++
-			rule := AnubisRule{
-				Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
-				Action: "WEIGH",
-				Weight: &config.Weight{Adjust: *crawlDelay},
-			}
-
-			if userAgent == "*" {
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{"true"}, // Always applies
-				}
-			} else {
-				rule.Expression = &config.ExpressionOrList{
-					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
-				}
-			}
-
-			anubisRules = append(anubisRules, rule)
+			key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay)
+			crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent)
 		}

-		// Handle blacklisted user agents (complete deny/challenge)
+		// Collect blacklisted user agents
 		if robotsRule.IsBlacklist {
-			ruleCounter++
-			rule := AnubisRule{
-				Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
-				Action: *userAgentDeny,
-			}
+			key := "blacklist"
+			blacklistGroups[key] = append(blacklistGroups[key], userAgent)
+		}

+		// Collect specific disallow rules
+		for _, disallow := range robotsRule.Disallows {
+			if disallow == "/" {
+				continue // Already handled as blacklist above
+			}
+			disallowGroups[disallow] = append(disallowGroups[disallow], userAgent)
+		}
+	}
+
+	// Generate rules for crawl delays
+	// Sort keys for deterministic order
+	var crawlDelayKeys []string
+	for key := range crawlDelayGroups {
+		crawlDelayKeys = append(crawlDelayKeys, key)
+	}
+	sort.Strings(crawlDelayKeys)
+
+	for _, key := range crawlDelayKeys {
+		userAgents := crawlDelayGroups[key]
+		ruleCounter++
+		rule := AnubisRule{
+			Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
+			Action: "WEIGH",
+			Weight: &config.Weight{Adjust: *crawlDelay},
+		}
+
+		if len(userAgents) == 1 && userAgents[0] == "*" {
+			rule.Expression = &config.ExpressionOrList{
+				All: []string{"true"}, // Always applies
+			}
+		} else if len(userAgents) == 1 {
+			rule.Expression = &config.ExpressionOrList{
+				All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
+			}
+		} else {
+			// Multiple user agents - use any block
+			var expressions []string
+			for _, ua := range userAgents {
+				if ua == "*" {
+					expressions = append(expressions, "true")
+				} else {
+					expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+				}
+			}
+			rule.Expression = &config.ExpressionOrList{
+				Any: expressions,
+			}
+		}
+
+		anubisRules = append(anubisRules, rule)
+	}
+
+	// Generate rules for blacklisted user agents
+	// Sort keys for deterministic order
+	var blacklistKeys []string
+	for key := range blacklistGroups {
+		blacklistKeys = append(blacklistKeys, key)
+	}
+	sort.Strings(blacklistKeys)
+
+	for _, key := range blacklistKeys {
+		userAgents := blacklistGroups[key]
+		ruleCounter++
+		rule := AnubisRule{
+			Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
+			Action: *userAgentDeny,
+		}
+
+		if len(userAgents) == 1 {
+			userAgent := userAgents[0]
 			if userAgent == "*" {
 				// This would block everything - convert to a weight adjustment instead
 				rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
@@ -257,41 +340,90 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 				All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
 			}
 		}
-		anubisRules = append(anubisRules, rule)
+		} else {
+			// Multiple user agents - use any block
+			var expressions []string
+			for _, ua := range userAgents {
+				if ua == "*" {
+					expressions = append(expressions, "true")
+				} else {
+					expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+				}
+			}
+			rule.Expression = &config.ExpressionOrList{
+				Any: expressions,
+			}
+		}
+
+		anubisRules = append(anubisRules, rule)
+	}
+
+	// Generate rules for specific disallow paths
+	// Sort keys for deterministic order
+	var disallowKeys []string
+	for key := range disallowGroups {
+		disallowKeys = append(disallowKeys, key)
+	}
+	sort.Strings(disallowKeys)
+
+	for _, path := range disallowKeys {
+		userAgents := disallowGroups[path]
+		ruleCounter++
+		rule := AnubisRule{
+			Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+			Action: *baseAction,
+		}
+
+		// Build CEL expression
+		var conditions []string
+
+		// Add user agent conditions
+		if len(userAgents) == 1 && userAgents[0] == "*" {
+			// Wildcard user agent - no user agent condition needed
+		} else if len(userAgents) == 1 {
+			conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
+		} else {
+			// Multiple user agents share this path. ExpressionOrList cannot
+			// nest an any block for the user agents inside the all block that
+			// carries the path condition, so emit one rule per user agent.
+			for _, ua := range userAgents {
+				if ua == "*" {
+					continue // a group of just "*" is handled above; skip the wildcard in a mixed group
+				}
+				ruleCounter++
+				subRule := AnubisRule{
+					Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+					Action: *baseAction,
+					Expression: &config.ExpressionOrList{
+						All: []string{
+							fmt.Sprintf("userAgent.contains(%q)", ua),
+							buildPathCondition(path),
+						},
+					},
+				}
+				anubisRules = append(anubisRules, subRule)
+			}
 			continue
 		}

-		// Handle specific disallow rules
-		for _, disallow := range robotsRule.Disallows {
-			if disallow == "/" {
-				continue // Already handled as blacklist above
-			}
+		// Add path condition
+		pathCondition := buildPathCondition(path)
+		conditions = append(conditions, pathCondition)

-			ruleCounter++
-			rule := AnubisRule{
-				Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
-				Action: *baseAction,
-			}
-
-			// Build CEL expression
-			var conditions []string
-
-			// Add user agent condition if not wildcard
-			if userAgent != "*" {
-				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
-			}
-
-			// Add path condition
-			pathCondition := buildPathCondition(disallow)
-			conditions = append(conditions, pathCondition)
-
-			rule.Expression = &config.ExpressionOrList{
-				All: conditions,
-			}
-
-			anubisRules = append(anubisRules, rule)
+		rule.Expression = &config.ExpressionOrList{
+			All: conditions,
 		}
+
+		anubisRules = append(anubisRules, rule)
 	}

 	return anubisRules
diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml
index b22f06f..b0a93d1 100644
--- a/cmd/robots2policy/testdata/blacklist.yaml
+++ b/cmd/robots2policy/testdata/blacklist.yaml
@@ -3,28 +3,28 @@ name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 3
-- action: CHALLENGE
-  expression: path.startsWith("/admin")
-  name: robots-txt-policy-disallow-2
-- action: DENY
-  expression: userAgent.contains("BadBot")
-  name: robots-txt-policy-blacklist-3
-- action: WEIGH
-  expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-crawl-delay-4
-  weight:
-    adjust: 3
-- action: DENY
-  expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-blacklist-5
 - action: WEIGH
   expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-6
+  name: robots-txt-policy-crawl-delay-2
   weight:
     adjust: 3
+- action: WEIGH
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-crawl-delay-3
+  weight:
+    adjust: 3
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SpamBot")
+  name: robots-txt-policy-blacklist-4
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-5
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("Googlebot")
-      - path.startsWith("/search")
-  name: robots-txt-policy-disallow-7
\ No newline at end of file
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-6
diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml
index 2eb0d19..1f13be3 100644
--- a/cmd/robots2policy/testdata/complex.yaml
+++ b/cmd/robots2policy/testdata/complex.yaml
@@ -1,71 +1,68 @@
 - action: WEIGH
-  expression: "true"
+  expression: userAgent.contains("Bingbot")
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 5
-- action: CHALLENGE
-  expression: path.startsWith("/admin/")
-  name: robots-txt-policy-disallow-2
-- action: CHALLENGE
-  expression: path.startsWith("/private/")
-  name: robots-txt-policy-disallow-3
-- action: CHALLENGE
-  expression: path.startsWith("/api/internal/")
-  name: robots-txt-policy-disallow-4
 - action: WEIGH
   expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-5
+  name: robots-txt-policy-crawl-delay-2
   weight:
     adjust: 5
-- action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("Googlebot")
-      - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-6
 - action: WEIGH
-  expression: userAgent.contains("Bingbot")
-  name: robots-txt-policy-crawl-delay-7
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-crawl-delay-3
   weight:
     adjust: 5
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-4
+  weight:
+    adjust: 5
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SeoBot")
+  name: robots-txt-policy-blacklist-5
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("Bingbot")
-      - path.startsWith("/search/")
+      - userAgent.contains("TestBot")
+      - path.matches("^/.*/admin")
+  name: robots-txt-policy-disallow-6
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/admin/")
   name: robots-txt-policy-disallow-8
 - action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("Bingbot")
-      - path.startsWith("/admin/")
+  expression: path.startsWith("/api/internal/")
   name: robots-txt-policy-disallow-9
-- action: DENY
-  expression: userAgent.contains("BadBot")
-  name: robots-txt-policy-blacklist-10
-- action: WEIGH
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-crawl-delay-11
-  weight:
-    adjust: 5
-- action: DENY
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-blacklist-12
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/.*/admin")
+      - userAgent.contains("TestBot")
+      - path.matches("^/file.\\.log")
+  name: robots-txt-policy-disallow-10
+- action: CHALLENGE
+  expression: path.startsWith("/private/")
+  name: robots-txt-policy-disallow-11
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/temp.*\\.html")
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-14
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/file.\\.log")
+      - userAgent.contains("TestBot")
+      - path.matches("^/temp.*\\.html")
   name: robots-txt-policy-disallow-15
diff --git a/cmd/robots2policy/testdata/simple.json b/cmd/robots2policy/testdata/simple.json
index 20bdf0d..c8e1de0 100644
--- a/cmd/robots2policy/testdata/simple.json
+++ b/cmd/robots2policy/testdata/simple.json
@@ -1,12 +1,12 @@
 [
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/admin/\")",
-    "name": "robots-txt-policy-disallow-1"
+    "name": "robots-txt-policy-disallow-1",
+    "action": "CHALLENGE"
   },
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/private\")",
-    "name": "robots-txt-policy-disallow-2"
+    "name": "robots-txt-policy-disallow-2",
+    "action": "CHALLENGE"
   }
 ]
\ No newline at end of file
diff --git a/cmd/robots2policy/testdata/wildcards.yaml b/cmd/robots2policy/testdata/wildcards.yaml
index ff51578..85be302 100644
--- a/cmd/robots2policy/testdata/wildcards.yaml
+++ b/cmd/robots2policy/testdata/wildcards.yaml
@@ -1,12 +1,12 @@
-- action: CHALLENGE
-  expression: path.matches("^/search.*")
-  name: robots-txt-policy-disallow-1
 - action: CHALLENGE
   expression: path.matches("^/.*/private")
+  name: robots-txt-policy-disallow-1
+- action: CHALLENGE
+  expression: path.matches("^/admin/.*.action=delete")
   name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.matches("^/file.\\.txt")
   name: robots-txt-policy-disallow-3
 - action: CHALLENGE
-  expression: path.matches("^/admin/.*.action=delete")
+  expression: path.matches("^/search.*")
   name: robots-txt-policy-disallow-4