From 291ed2a084cf5b1a2257a0b2f313729b5532c4d1 Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Sat, 26 Jul 2025 20:27:00 -0400 Subject: [PATCH 1/4] fix: handle multiple user agents --- cmd/robots2policy/main.go | 276 ++++++++++++++++------ cmd/robots2policy/testdata/blacklist.yaml | 36 +-- cmd/robots2policy/testdata/complex.yaml | 81 +++---- cmd/robots2policy/testdata/simple.json | 8 +- cmd/robots2policy/testdata/wildcards.yaml | 8 +- 5 files changed, 269 insertions(+), 140 deletions(-) diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go index eaa4d7f..8928e3c 100644 --- a/cmd/robots2policy/main.go +++ b/cmd/robots2policy/main.go @@ -10,6 +10,7 @@ import ( "net/http" "os" "regexp" + "sort" "strings" "github.com/TecharoHQ/anubis/lib/policy/config" @@ -133,7 +134,10 @@ func main() { func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) { scanner := bufio.NewScanner(input) var rules []RobotsRule - var currentRule *RobotsRule + var currentUserAgents []string + var currentDisallows []string + var currentAllows []string + var currentCrawlDelay int for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) @@ -154,38 +158,60 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) { switch directive { case "user-agent": - // Start a new rule section - if currentRule != nil { - rules = append(rules, *currentRule) - } - currentRule = &RobotsRule{ - UserAgent: value, - Disallows: make([]string, 0), - Allows: make([]string, 0), + // If we have accumulated rules with directives and encounter a new user-agent, + // flush the current rules + if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) { + for _, userAgent := range currentUserAgents { + rule := RobotsRule{ + UserAgent: userAgent, + Disallows: make([]string, len(currentDisallows)), + Allows: make([]string, len(currentAllows)), + CrawlDelay: currentCrawlDelay, + } + copy(rule.Disallows, currentDisallows) + copy(rule.Allows, currentAllows) + rules = append(rules, rule) + } + // Reset for next group + currentUserAgents = nil + currentDisallows = nil + currentAllows = nil + currentCrawlDelay = 0 } + currentUserAgents = append(currentUserAgents, value) case "disallow": - if currentRule != nil && value != "" { - currentRule.Disallows = append(currentRule.Disallows, value) + if len(currentUserAgents) > 0 && value != "" { + currentDisallows = append(currentDisallows, value) } case "allow": - if currentRule != nil && value != "" { - currentRule.Allows = append(currentRule.Allows, value) + if len(currentUserAgents) > 0 && value != "" { + currentAllows = append(currentAllows, value) } case "crawl-delay": - if currentRule != nil { + if len(currentUserAgents) > 0 { if delay, err := parseIntSafe(value); err == nil { - currentRule.CrawlDelay = delay + currentCrawlDelay = delay } } } } - // Don't forget the last rule - if currentRule != nil { - rules = append(rules, *currentRule) + // Don't forget the last group of rules + if len(currentUserAgents) > 0 { + for _, userAgent := range currentUserAgents { + rule := RobotsRule{ + UserAgent: userAgent, + Disallows: make([]string, len(currentDisallows)), + Allows: make([]string, len(currentAllows)), + CrawlDelay: currentCrawlDelay, + } + copy(rule.Disallows, currentDisallows) + copy(rule.Allows, currentAllows) + rules = append(rules, rule) + } } // Mark blacklisted user agents (those with "Disallow: /") @@ -211,39 +237,96 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { var anubisRules []AnubisRule ruleCounter 
:= 0 + // Group rules by their directives to create any blocks + blacklistGroups := make(map[string][]string) // key: directive signature, value: user agents + disallowGroups := make(map[string][]string) // key: path, value: user agents + crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents + for _, robotsRule := range robotsRules { userAgent := robotsRule.UserAgent - // Handle crawl delay as weight adjustment (do this first before any continues) + // Handle crawl delay groups if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 { - ruleCounter++ - rule := AnubisRule{ - Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter), - Action: "WEIGH", - Weight: &config.Weight{Adjust: *crawlDelay}, - } - - if userAgent == "*" { - rule.Expression = &config.ExpressionOrList{ - All: []string{"true"}, // Always applies - } - } else { - rule.Expression = &config.ExpressionOrList{ - All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, - } - } - - anubisRules = append(anubisRules, rule) + key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay) + crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent) } - // Handle blacklisted user agents (complete deny/challenge) + // Handle blacklisted user agents if robotsRule.IsBlacklist { - ruleCounter++ - rule := AnubisRule{ - Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter), - Action: *userAgentDeny, - } + key := "blacklist" + blacklistGroups[key] = append(blacklistGroups[key], userAgent) + } + // Handle specific disallow rules + for _, disallow := range robotsRule.Disallows { + if disallow == "/" { + continue // Already handled as blacklist above + } + disallowGroups[disallow] = append(disallowGroups[disallow], userAgent) + } + } + + // Generate rules for crawl delays + // Sort keys for deterministic order + var crawlDelayKeys []string + for key := range crawlDelayGroups { + crawlDelayKeys = append(crawlDelayKeys, key) + } + sort.Strings(crawlDelayKeys) + + for _, key := range crawlDelayKeys { + userAgents := crawlDelayGroups[key] + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter), + Action: "WEIGH", + Weight: &config.Weight{Adjust: *crawlDelay}, + } + + if len(userAgents) == 1 && userAgents[0] == "*" { + rule.Expression = &config.ExpressionOrList{ + All: []string{"true"}, // Always applies + } + } else if len(userAgents) == 1 { + rule.Expression = &config.ExpressionOrList{ + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}, + } + } else { + // Multiple user agents - use any block + var expressions []string + for _, ua := range userAgents { + if ua == "*" { + expressions = append(expressions, "true") + } else { + expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } + rule.Expression = &config.ExpressionOrList{ + Any: expressions, + } + } + + anubisRules = append(anubisRules, rule) + } + + // Generate rules for blacklisted user agents + // Sort keys for deterministic order + var blacklistKeys []string + for key := range blacklistGroups { + blacklistKeys = append(blacklistKeys, key) + } + sort.Strings(blacklistKeys) + + for _, key := range blacklistKeys { + userAgents := blacklistGroups[key] + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter), + Action: *userAgentDeny, + } + + if len(userAgents) == 1 { + userAgent := userAgents[0] if userAgent == "*" { // This would block everything - convert to a weight adjustment instead rule.Name = 
fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter) @@ -257,41 +340,90 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, } } - anubisRules = append(anubisRules, rule) + } else { + // Multiple user agents - use any block + var expressions []string + for _, ua := range userAgents { + if ua == "*" { + expressions = append(expressions, "true") + } else { + expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } + rule.Expression = &config.ExpressionOrList{ + Any: expressions, + } + } + + anubisRules = append(anubisRules, rule) + } + + // Generate rules for specific disallow paths + // Sort keys for deterministic order + var disallowKeys []string + for key := range disallowGroups { + disallowKeys = append(disallowKeys, key) + } + sort.Strings(disallowKeys) + + for _, path := range disallowKeys { + userAgents := disallowGroups[path] + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), + Action: *baseAction, + } + + // Build CEL expression + var conditions []string + + // Add user agent conditions + if len(userAgents) == 1 && userAgents[0] == "*" { + // Wildcard user agent - no user agent condition needed + } else if len(userAgents) == 1 { + conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0])) + } else { + // Multiple user agents - use any block for user agents + var uaExpressions []string + for _, ua := range userAgents { + if ua == "*" { + uaExpressions = append(uaExpressions, "true") + } else { + uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } + // For multiple user agents, we need to use a more complex expression + // This is a limitation - we can't easily combine any for user agents with all for path + // So we'll create separate rules for each user agent + for _, ua := range userAgents { + if ua == "*" { + continue // Skip wildcard as it's handled separately + } + ruleCounter++ + subRule := AnubisRule{ + Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), + Action: *baseAction, + Expression: &config.ExpressionOrList{ + All: []string{ + fmt.Sprintf("userAgent.contains(%q)", ua), + buildPathCondition(path), + }, + }, + } + anubisRules = append(anubisRules, subRule) + } continue } - // Handle specific disallow rules - for _, disallow := range robotsRule.Disallows { - if disallow == "/" { - continue // Already handled as blacklist above - } + // Add path condition + pathCondition := buildPathCondition(path) + conditions = append(conditions, pathCondition) - ruleCounter++ - rule := AnubisRule{ - Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), - Action: *baseAction, - } - - // Build CEL expression - var conditions []string - - // Add user agent condition if not wildcard - if userAgent != "*" { - conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent)) - } - - // Add path condition - pathCondition := buildPathCondition(disallow) - conditions = append(conditions, pathCondition) - - rule.Expression = &config.ExpressionOrList{ - All: conditions, - } - - anubisRules = append(anubisRules, rule) + rule.Expression = &config.ExpressionOrList{ + All: conditions, } + anubisRules = append(anubisRules, rule) } return anubisRules diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml index b22f06f..b0a93d1 100644 --- a/cmd/robots2policy/testdata/blacklist.yaml +++ 
b/cmd/robots2policy/testdata/blacklist.yaml @@ -3,28 +3,28 @@ name: robots-txt-policy-crawl-delay-1 weight: adjust: 3 -- action: CHALLENGE - expression: path.startsWith("/admin") - name: robots-txt-policy-disallow-2 -- action: DENY - expression: userAgent.contains("BadBot") - name: robots-txt-policy-blacklist-3 -- action: WEIGH - expression: userAgent.contains("SpamBot") - name: robots-txt-policy-crawl-delay-4 - weight: - adjust: 3 -- action: DENY - expression: userAgent.contains("SpamBot") - name: robots-txt-policy-blacklist-5 - action: WEIGH expression: userAgent.contains("Googlebot") - name: robots-txt-policy-crawl-delay-6 + name: robots-txt-policy-crawl-delay-2 weight: adjust: 3 +- action: WEIGH + expression: userAgent.contains("SpamBot") + name: robots-txt-policy-crawl-delay-3 + weight: + adjust: 3 +- action: DENY + expression: + any: + - userAgent.contains("BadBot") + - userAgent.contains("SpamBot") + name: robots-txt-policy-blacklist-4 +- action: CHALLENGE + expression: path.startsWith("/admin") + name: robots-txt-policy-disallow-5 - action: CHALLENGE expression: all: - - userAgent.contains("Googlebot") - - path.startsWith("/search") - name: robots-txt-policy-disallow-7 \ No newline at end of file + - userAgent.contains("Googlebot") + - path.startsWith("/search") + name: robots-txt-policy-disallow-6 diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml index 2eb0d19..1f13be3 100644 --- a/cmd/robots2policy/testdata/complex.yaml +++ b/cmd/robots2policy/testdata/complex.yaml @@ -1,71 +1,68 @@ - action: WEIGH - expression: "true" + expression: userAgent.contains("Bingbot") name: robots-txt-policy-crawl-delay-1 weight: adjust: 5 -- action: CHALLENGE - expression: path.startsWith("/admin/") - name: robots-txt-policy-disallow-2 -- action: CHALLENGE - expression: path.startsWith("/private/") - name: robots-txt-policy-disallow-3 -- action: CHALLENGE - expression: path.startsWith("/api/internal/") - name: robots-txt-policy-disallow-4 - action: WEIGH expression: userAgent.contains("Googlebot") - name: robots-txt-policy-crawl-delay-5 + name: robots-txt-policy-crawl-delay-2 weight: adjust: 5 -- action: CHALLENGE - expression: - all: - - userAgent.contains("Googlebot") - - path.startsWith("/search/") - name: robots-txt-policy-disallow-6 - action: WEIGH - expression: userAgent.contains("Bingbot") - name: robots-txt-policy-crawl-delay-7 + expression: userAgent.contains("SeoBot") + name: robots-txt-policy-crawl-delay-3 weight: adjust: 5 +- action: WEIGH + expression: "true" + name: robots-txt-policy-crawl-delay-4 + weight: + adjust: 5 +- action: DENY + expression: + any: + - userAgent.contains("BadBot") + - userAgent.contains("SeoBot") + name: robots-txt-policy-blacklist-5 - action: CHALLENGE expression: all: - - userAgent.contains("Bingbot") - - path.startsWith("/search/") + - userAgent.contains("TestBot") + - path.matches("^/.*/admin") + name: robots-txt-policy-disallow-6 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Bingbot") + - path.startsWith("/admin/") name: robots-txt-policy-disallow-8 - action: CHALLENGE - expression: - all: - - userAgent.contains("Bingbot") - - path.startsWith("/admin/") + expression: path.startsWith("/api/internal/") name: robots-txt-policy-disallow-9 -- action: DENY - expression: userAgent.contains("BadBot") - name: robots-txt-policy-blacklist-10 -- action: WEIGH - expression: userAgent.contains("SeoBot") - name: robots-txt-policy-crawl-delay-11 - weight: - adjust: 5 -- action: DENY - expression: 
userAgent.contains("SeoBot") - name: robots-txt-policy-blacklist-12 - action: CHALLENGE expression: all: - - userAgent.contains("TestBot") - - path.matches("^/.*/admin") + - userAgent.contains("TestBot") + - path.matches("^/file.\\.log") + name: robots-txt-policy-disallow-10 +- action: CHALLENGE + expression: path.startsWith("/private/") + name: robots-txt-policy-disallow-11 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Googlebot") + - path.startsWith("/search/") name: robots-txt-policy-disallow-13 - action: CHALLENGE expression: all: - - userAgent.contains("TestBot") - - path.matches("^/temp.*\\.html") + - userAgent.contains("Bingbot") + - path.startsWith("/search/") name: robots-txt-policy-disallow-14 - action: CHALLENGE expression: all: - - userAgent.contains("TestBot") - - path.matches("^/file.\\.log") + - userAgent.contains("TestBot") + - path.matches("^/temp.*\\.html") name: robots-txt-policy-disallow-15 diff --git a/cmd/robots2policy/testdata/simple.json b/cmd/robots2policy/testdata/simple.json index 20bdf0d..c8e1de0 100644 --- a/cmd/robots2policy/testdata/simple.json +++ b/cmd/robots2policy/testdata/simple.json @@ -1,12 +1,12 @@ [ { - "action": "CHALLENGE", "expression": "path.startsWith(\"/admin/\")", - "name": "robots-txt-policy-disallow-1" + "name": "robots-txt-policy-disallow-1", + "action": "CHALLENGE" }, { - "action": "CHALLENGE", "expression": "path.startsWith(\"/private\")", - "name": "robots-txt-policy-disallow-2" + "name": "robots-txt-policy-disallow-2", + "action": "CHALLENGE" } ] \ No newline at end of file diff --git a/cmd/robots2policy/testdata/wildcards.yaml b/cmd/robots2policy/testdata/wildcards.yaml index ff51578..85be302 100644 --- a/cmd/robots2policy/testdata/wildcards.yaml +++ b/cmd/robots2policy/testdata/wildcards.yaml @@ -1,12 +1,12 @@ -- action: CHALLENGE - expression: path.matches("^/search.*") - name: robots-txt-policy-disallow-1 - action: CHALLENGE expression: path.matches("^/.*/private") + name: robots-txt-policy-disallow-1 +- action: CHALLENGE + expression: path.matches("^/admin/.*.action=delete") name: robots-txt-policy-disallow-2 - action: CHALLENGE expression: path.matches("^/file.\\.txt") name: robots-txt-policy-disallow-3 - action: CHALLENGE - expression: path.matches("^/admin/.*.action=delete") + expression: path.matches("^/search.*") name: robots-txt-policy-disallow-4 From e51b4bd9657349722b36944e7ec0088c2243c7bc Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Sat, 26 Jul 2025 20:34:04 -0400 Subject: [PATCH 2/4] fix: collate --- cmd/robots2policy/main.go | 321 ++++++++---------- cmd/robots2policy/robots2policy_test.go | 6 + cmd/robots2policy/testdata/blacklist.yaml | 30 +- cmd/robots2policy/testdata/complex.yaml | 93 ++--- .../testdata/consecutive.robots.txt | 25 ++ cmd/robots2policy/testdata/consecutive.yaml | 47 +++ cmd/robots2policy/testdata/wildcards.yaml | 6 +- 7 files changed, 280 insertions(+), 248 deletions(-) create mode 100644 cmd/robots2policy/testdata/consecutive.robots.txt create mode 100644 cmd/robots2policy/testdata/consecutive.yaml diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go index 8928e3c..9bd2c9c 100644 --- a/cmd/robots2policy/main.go +++ b/cmd/robots2policy/main.go @@ -10,7 +10,6 @@ import ( "net/http" "os" "regexp" - "sort" "strings" "github.com/TecharoHQ/anubis/lib/policy/config" @@ -30,7 +29,7 @@ var ( ) type RobotsRule struct { - UserAgent string + UserAgents []string Disallows []string Allows []string CrawlDelay int @@ -161,17 +160,16 @@ func parseRobotsTxt(input io.Reader) 
([]RobotsRule, error) { // If we have accumulated rules with directives and encounter a new user-agent, // flush the current rules if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) { - for _, userAgent := range currentUserAgents { - rule := RobotsRule{ - UserAgent: userAgent, - Disallows: make([]string, len(currentDisallows)), - Allows: make([]string, len(currentAllows)), - CrawlDelay: currentCrawlDelay, - } - copy(rule.Disallows, currentDisallows) - copy(rule.Allows, currentAllows) - rules = append(rules, rule) + rule := RobotsRule{ + UserAgents: make([]string, len(currentUserAgents)), + Disallows: make([]string, len(currentDisallows)), + Allows: make([]string, len(currentAllows)), + CrawlDelay: currentCrawlDelay, } + copy(rule.UserAgents, currentUserAgents) + copy(rule.Disallows, currentDisallows) + copy(rule.Allows, currentAllows) + rules = append(rules, rule) // Reset for next group currentUserAgents = nil currentDisallows = nil @@ -201,17 +199,16 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) { // Don't forget the last group of rules if len(currentUserAgents) > 0 { - for _, userAgent := range currentUserAgents { - rule := RobotsRule{ - UserAgent: userAgent, - Disallows: make([]string, len(currentDisallows)), - Allows: make([]string, len(currentAllows)), - CrawlDelay: currentCrawlDelay, - } - copy(rule.Disallows, currentDisallows) - copy(rule.Allows, currentAllows) - rules = append(rules, rule) + rule := RobotsRule{ + UserAgents: make([]string, len(currentUserAgents)), + Disallows: make([]string, len(currentDisallows)), + Allows: make([]string, len(currentAllows)), + CrawlDelay: currentCrawlDelay, } + copy(rule.UserAgents, currentUserAgents) + copy(rule.Disallows, currentDisallows) + copy(rule.Allows, currentAllows) + rules = append(rules, rule) } // Mark blacklisted user agents (those with "Disallow: /") @@ -237,24 +234,82 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { var anubisRules []AnubisRule ruleCounter := 0 - // Group rules by their directives to create any blocks - blacklistGroups := make(map[string][]string) // key: directive signature, value: user agents - disallowGroups := make(map[string][]string) // key: path, value: user agents - crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents - + // Process each robots rule individually for _, robotsRule := range robotsRules { - userAgent := robotsRule.UserAgent + userAgents := robotsRule.UserAgents - // Handle crawl delay groups + // Handle crawl delay if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 { - key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay) - crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent) + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter), + Action: "WEIGH", + Weight: &config.Weight{Adjust: *crawlDelay}, + } + + if len(userAgents) == 1 && userAgents[0] == "*" { + rule.Expression = &config.ExpressionOrList{ + All: []string{"true"}, // Always applies + } + } else if len(userAgents) == 1 { + rule.Expression = &config.ExpressionOrList{ + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}, + } + } else { + // Multiple user agents - use any block + var expressions []string + for _, ua := range userAgents { + if ua == "*" { + expressions = append(expressions, "true") + } else { + expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } + rule.Expression = &config.ExpressionOrList{ + 
Any: expressions, + } + } + anubisRules = append(anubisRules, rule) } // Handle blacklisted user agents if robotsRule.IsBlacklist { - key := "blacklist" - blacklistGroups[key] = append(blacklistGroups[key], userAgent) + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter), + Action: *userAgentDeny, + } + + if len(userAgents) == 1 { + userAgent := userAgents[0] + if userAgent == "*" { + // This would block everything - convert to a weight adjustment instead + rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter) + rule.Action = "WEIGH" + rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly + rule.Expression = &config.ExpressionOrList{ + All: []string{"true"}, // Always applies + } + } else { + rule.Expression = &config.ExpressionOrList{ + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + } + } + } else { + // Multiple user agents - use any block + var expressions []string + for _, ua := range userAgents { + if ua == "*" { + expressions = append(expressions, "true") + } else { + expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } + } + rule.Expression = &config.ExpressionOrList{ + Any: expressions, + } + } + anubisRules = append(anubisRules, rule) } // Handle specific disallow rules @@ -262,168 +317,64 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { if disallow == "/" { continue // Already handled as blacklist above } - disallowGroups[disallow] = append(disallowGroups[disallow], userAgent) - } - } - // Generate rules for crawl delays - // Sort keys for deterministic order - var crawlDelayKeys []string - for key := range crawlDelayGroups { - crawlDelayKeys = append(crawlDelayKeys, key) - } - sort.Strings(crawlDelayKeys) - - for _, key := range crawlDelayKeys { - userAgents := crawlDelayGroups[key] - ruleCounter++ - rule := AnubisRule{ - Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter), - Action: "WEIGH", - Weight: &config.Weight{Adjust: *crawlDelay}, - } - - if len(userAgents) == 1 && userAgents[0] == "*" { - rule.Expression = &config.ExpressionOrList{ - All: []string{"true"}, // Always applies + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), + Action: *baseAction, } - } else if len(userAgents) == 1 { - rule.Expression = &config.ExpressionOrList{ - All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}, - } - } else { - // Multiple user agents - use any block - var expressions []string - for _, ua := range userAgents { - if ua == "*" { - expressions = append(expressions, "true") - } else { - expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) - } - } - rule.Expression = &config.ExpressionOrList{ - Any: expressions, - } - } - anubisRules = append(anubisRules, rule) - } + // Build CEL expression + var conditions []string - // Generate rules for blacklisted user agents - // Sort keys for deterministic order - var blacklistKeys []string - for key := range blacklistGroups { - blacklistKeys = append(blacklistKeys, key) - } - sort.Strings(blacklistKeys) - - for _, key := range blacklistKeys { - userAgents := blacklistGroups[key] - ruleCounter++ - rule := AnubisRule{ - Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter), - Action: *userAgentDeny, - } - - if len(userAgents) == 1 { - userAgent := userAgents[0] - if userAgent == "*" { - // This would block everything - convert to a weight adjustment instead - rule.Name = 
fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter) - rule.Action = "WEIGH" - rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly - rule.Expression = &config.ExpressionOrList{ - All: []string{"true"}, // Always applies - } + // Add user agent conditions + if len(userAgents) == 1 && userAgents[0] == "*" { + // Wildcard user agent - no user agent condition needed + } else if len(userAgents) == 1 { + conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0])) } else { - rule.Expression = &config.ExpressionOrList{ - All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + // Multiple user agents - use any block for user agents + var uaExpressions []string + for _, ua := range userAgents { + if ua == "*" { + uaExpressions = append(uaExpressions, "true") + } else { + uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua)) + } } - } - } else { - // Multiple user agents - use any block - var expressions []string - for _, ua := range userAgents { - if ua == "*" { - expressions = append(expressions, "true") - } else { - expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua)) - } - } - rule.Expression = &config.ExpressionOrList{ - Any: expressions, - } - } - - anubisRules = append(anubisRules, rule) - } - - // Generate rules for specific disallow paths - // Sort keys for deterministic order - var disallowKeys []string - for key := range disallowGroups { - disallowKeys = append(disallowKeys, key) - } - sort.Strings(disallowKeys) - - for _, path := range disallowKeys { - userAgents := disallowGroups[path] - ruleCounter++ - rule := AnubisRule{ - Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), - Action: *baseAction, - } - - // Build CEL expression - var conditions []string - - // Add user agent conditions - if len(userAgents) == 1 && userAgents[0] == "*" { - // Wildcard user agent - no user agent condition needed - } else if len(userAgents) == 1 { - conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0])) - } else { - // Multiple user agents - use any block for user agents - var uaExpressions []string - for _, ua := range userAgents { - if ua == "*" { - uaExpressions = append(uaExpressions, "true") - } else { - uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua)) - } - } - // For multiple user agents, we need to use a more complex expression - // This is a limitation - we can't easily combine any for user agents with all for path - // So we'll create separate rules for each user agent - for _, ua := range userAgents { - if ua == "*" { - continue // Skip wildcard as it's handled separately - } - ruleCounter++ - subRule := AnubisRule{ - Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), - Action: *baseAction, - Expression: &config.ExpressionOrList{ - All: []string{ - fmt.Sprintf("userAgent.contains(%q)", ua), - buildPathCondition(path), + // For multiple user agents, we need to use a more complex expression + // This is a limitation - we can't easily combine any for user agents with all for path + // So we'll create separate rules for each user agent + for _, ua := range userAgents { + if ua == "*" { + continue // Skip wildcard as it's handled separately + } + ruleCounter++ + subRule := AnubisRule{ + Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), + Action: *baseAction, + Expression: &config.ExpressionOrList{ + All: []string{ + fmt.Sprintf("userAgent.contains(%q)", ua), + 
buildPathCondition(disallow), + }, }, - }, + } + anubisRules = append(anubisRules, subRule) } - anubisRules = append(anubisRules, subRule) + continue } - continue + + // Add path condition + pathCondition := buildPathCondition(disallow) + conditions = append(conditions, pathCondition) + + rule.Expression = &config.ExpressionOrList{ + All: conditions, + } + + anubisRules = append(anubisRules, rule) } - - // Add path condition - pathCondition := buildPathCondition(path) - conditions = append(conditions, pathCondition) - - rule.Expression = &config.ExpressionOrList{ - All: conditions, - } - - anubisRules = append(anubisRules, rule) } return anubisRules diff --git a/cmd/robots2policy/robots2policy_test.go b/cmd/robots2policy/robots2policy_test.go index aa73f6b..e9d90e6 100644 --- a/cmd/robots2policy/robots2policy_test.go +++ b/cmd/robots2policy/robots2policy_test.go @@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) { expectedFile: "complex.yaml", options: TestOptions{format: "yaml", crawlDelayWeight: 5}, }, + { + name: "consecutive_user_agents", + robotsFile: "consecutive.robots.txt", + expectedFile: "consecutive.yaml", + options: TestOptions{format: "yaml", crawlDelayWeight: 3}, + }, } for _, tc := range testCases { diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml index b0a93d1..a3096f5 100644 --- a/cmd/robots2policy/testdata/blacklist.yaml +++ b/cmd/robots2policy/testdata/blacklist.yaml @@ -3,28 +3,28 @@ name: robots-txt-policy-crawl-delay-1 weight: adjust: 3 -- action: WEIGH - expression: userAgent.contains("Googlebot") - name: robots-txt-policy-crawl-delay-2 - weight: - adjust: 3 +- action: CHALLENGE + expression: path.startsWith("/admin") + name: robots-txt-policy-disallow-2 +- action: DENY + expression: userAgent.contains("BadBot") + name: robots-txt-policy-blacklist-3 - action: WEIGH expression: userAgent.contains("SpamBot") - name: robots-txt-policy-crawl-delay-3 + name: robots-txt-policy-crawl-delay-4 weight: adjust: 3 - action: DENY - expression: - any: - - userAgent.contains("BadBot") - - userAgent.contains("SpamBot") - name: robots-txt-policy-blacklist-4 -- action: CHALLENGE - expression: path.startsWith("/admin") - name: robots-txt-policy-disallow-5 + expression: userAgent.contains("SpamBot") + name: robots-txt-policy-blacklist-5 +- action: WEIGH + expression: userAgent.contains("Googlebot") + name: robots-txt-policy-crawl-delay-6 + weight: + adjust: 3 - action: CHALLENGE expression: all: - userAgent.contains("Googlebot") - path.startsWith("/search") - name: robots-txt-policy-disallow-6 + name: robots-txt-policy-disallow-7 diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml index 1f13be3..6e677ad 100644 --- a/cmd/robots2policy/testdata/complex.yaml +++ b/cmd/robots2policy/testdata/complex.yaml @@ -1,68 +1,71 @@ - action: WEIGH - expression: userAgent.contains("Bingbot") + expression: "true" name: robots-txt-policy-crawl-delay-1 weight: adjust: 5 -- action: WEIGH - expression: userAgent.contains("Googlebot") - name: robots-txt-policy-crawl-delay-2 - weight: - adjust: 5 -- action: WEIGH - expression: userAgent.contains("SeoBot") - name: robots-txt-policy-crawl-delay-3 - weight: - adjust: 5 -- action: WEIGH - expression: "true" - name: robots-txt-policy-crawl-delay-4 - weight: - adjust: 5 -- action: DENY - expression: - any: - - userAgent.contains("BadBot") - - userAgent.contains("SeoBot") - name: robots-txt-policy-blacklist-5 - action: CHALLENGE - expression: - all: - - 
userAgent.contains("TestBot") - - path.matches("^/.*/admin") - name: robots-txt-policy-disallow-6 -- action: CHALLENGE - expression: - all: - - userAgent.contains("Bingbot") - - path.startsWith("/admin/") - name: robots-txt-policy-disallow-8 -- action: CHALLENGE - expression: path.startsWith("/api/internal/") - name: robots-txt-policy-disallow-9 -- action: CHALLENGE - expression: - all: - - userAgent.contains("TestBot") - - path.matches("^/file.\\.log") - name: robots-txt-policy-disallow-10 + expression: path.startsWith("/admin/") + name: robots-txt-policy-disallow-2 - action: CHALLENGE expression: path.startsWith("/private/") - name: robots-txt-policy-disallow-11 + name: robots-txt-policy-disallow-3 +- action: CHALLENGE + expression: path.startsWith("/api/internal/") + name: robots-txt-policy-disallow-4 +- action: WEIGH + expression: userAgent.contains("Googlebot") + name: robots-txt-policy-crawl-delay-5 + weight: + adjust: 5 - action: CHALLENGE expression: all: - userAgent.contains("Googlebot") - path.startsWith("/search/") - name: robots-txt-policy-disallow-13 + name: robots-txt-policy-disallow-6 +- action: WEIGH + expression: userAgent.contains("Bingbot") + name: robots-txt-policy-crawl-delay-7 + weight: + adjust: 5 - action: CHALLENGE expression: all: - userAgent.contains("Bingbot") - path.startsWith("/search/") - name: robots-txt-policy-disallow-14 + name: robots-txt-policy-disallow-8 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Bingbot") + - path.startsWith("/admin/") + name: robots-txt-policy-disallow-9 +- action: DENY + expression: userAgent.contains("BadBot") + name: robots-txt-policy-blacklist-10 +- action: WEIGH + expression: userAgent.contains("SeoBot") + name: robots-txt-policy-crawl-delay-11 + weight: + adjust: 5 +- action: DENY + expression: userAgent.contains("SeoBot") + name: robots-txt-policy-blacklist-12 +- action: CHALLENGE + expression: + all: + - userAgent.contains("TestBot") + - path.matches("^/.*/admin") + name: robots-txt-policy-disallow-13 - action: CHALLENGE expression: all: - userAgent.contains("TestBot") - path.matches("^/temp.*\\.html") + name: robots-txt-policy-disallow-14 +- action: CHALLENGE + expression: + all: + - userAgent.contains("TestBot") + - path.matches("^/file.\\.log") name: robots-txt-policy-disallow-15 diff --git a/cmd/robots2policy/testdata/consecutive.robots.txt b/cmd/robots2policy/testdata/consecutive.robots.txt new file mode 100644 index 0000000..e4f6cb5 --- /dev/null +++ b/cmd/robots2policy/testdata/consecutive.robots.txt @@ -0,0 +1,25 @@ +# Test consecutive user agents that should be grouped into any: blocks +User-agent: * +Disallow: /admin +Crawl-delay: 10 + +# Multiple consecutive user agents - should be grouped +User-agent: BadBot +User-agent: SpamBot +User-agent: EvilBot +Disallow: / + +# Single user agent - should be separate +User-agent: GoodBot +Disallow: /private + +# Multiple consecutive user agents with crawl delay +User-agent: SlowBot1 +User-agent: SlowBot2 +Crawl-delay: 5 + +# Multiple consecutive user agents with specific path +User-agent: SearchBot1 +User-agent: SearchBot2 +User-agent: SearchBot3 +Disallow: /search \ No newline at end of file diff --git a/cmd/robots2policy/testdata/consecutive.yaml b/cmd/robots2policy/testdata/consecutive.yaml new file mode 100644 index 0000000..144abda --- /dev/null +++ b/cmd/robots2policy/testdata/consecutive.yaml @@ -0,0 +1,47 @@ +- action: WEIGH + expression: "true" + name: robots-txt-policy-crawl-delay-1 + weight: + adjust: 3 +- action: CHALLENGE + expression: 
path.startsWith("/admin") + name: robots-txt-policy-disallow-2 +- action: DENY + expression: + any: + - userAgent.contains("BadBot") + - userAgent.contains("SpamBot") + - userAgent.contains("EvilBot") + name: robots-txt-policy-blacklist-3 +- action: CHALLENGE + expression: + all: + - userAgent.contains("GoodBot") + - path.startsWith("/private") + name: robots-txt-policy-disallow-4 +- action: WEIGH + expression: + any: + - userAgent.contains("SlowBot1") + - userAgent.contains("SlowBot2") + name: robots-txt-policy-crawl-delay-5 + weight: + adjust: 3 +- action: CHALLENGE + expression: + all: + - userAgent.contains("SearchBot1") + - path.startsWith("/search") + name: robots-txt-policy-disallow-7 +- action: CHALLENGE + expression: + all: + - userAgent.contains("SearchBot2") + - path.startsWith("/search") + name: robots-txt-policy-disallow-8 +- action: CHALLENGE + expression: + all: + - userAgent.contains("SearchBot3") + - path.startsWith("/search") + name: robots-txt-policy-disallow-9 diff --git a/cmd/robots2policy/testdata/wildcards.yaml b/cmd/robots2policy/testdata/wildcards.yaml index 85be302..ff51578 100644 --- a/cmd/robots2policy/testdata/wildcards.yaml +++ b/cmd/robots2policy/testdata/wildcards.yaml @@ -1,12 +1,12 @@ - action: CHALLENGE - expression: path.matches("^/.*/private") + expression: path.matches("^/search.*") name: robots-txt-policy-disallow-1 - action: CHALLENGE - expression: path.matches("^/admin/.*.action=delete") + expression: path.matches("^/.*/private") name: robots-txt-policy-disallow-2 - action: CHALLENGE expression: path.matches("^/file.\\.txt") name: robots-txt-policy-disallow-3 - action: CHALLENGE - expression: path.matches("^/search.*") + expression: path.matches("^/admin/.*.action=delete") name: robots-txt-policy-disallow-4 From 61db9a618d884600c97530bd9c8f12bc6f3ed3e1 Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Sat, 26 Jul 2025 20:43:52 -0400 Subject: [PATCH 3/4] docs: add changelog entry --- docs/docs/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 5970c61..6cf062d 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The [Thoth client](https://anubis.techaro.lol/docs/admin/thoth) is now public in the repo instead of being an internal package. - The [`segments`](./admin/configuration/expressions.mdx#segments) function was added for splitting a path into its slash-separated segments. 
+- Fixed `robots2policy` to properly group consecutive user agents into `any:` instead of only processing the last one ## v1.21.3: Minfilia Warde - Echo 3 From 2e7f37215c912bf0448fd8a45973683a37a4f96b Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Sat, 26 Jul 2025 20:49:54 -0400 Subject: [PATCH 4/4] fix: remove unused blocks --- cmd/robots2policy/main.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go index 9bd2c9c..a0224d4 100644 --- a/cmd/robots2policy/main.go +++ b/cmd/robots2policy/main.go @@ -29,7 +29,7 @@ var ( ) type RobotsRule struct { - UserAgents []string + UserAgents []string Disallows []string Allows []string CrawlDelay int @@ -333,15 +333,6 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { } else if len(userAgents) == 1 { conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0])) } else { - // Multiple user agents - use any block for user agents - var uaExpressions []string - for _, ua := range userAgents { - if ua == "*" { - uaExpressions = append(uaExpressions, "true") - } else { - uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua)) - } - } // For multiple user agents, we need to use a more complex expression // This is a limitation - we can't easily combine any for user agents with all for path // So we'll create separate rules for each user agent
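
---

Reviewer note on the grouping semantics this series implements: per RFC 9309, consecutive `User-agent` lines form a single group that shares the directives that follow them, which is exactly what patch 2's collation does. Below is a minimal, self-contained sketch of that grouping — simplified from the patch, not the patch code itself: it handles only `Disallow` (ignoring `Allow` and `Crawl-delay`), and the `group` type is a stand-in for the patch's `RobotsRule`.

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

// group is a simplified stand-in for the patch's RobotsRule:
// consecutive User-agent lines share the directives that follow them.
type group struct {
	agents    []string
	disallows []string
}

func parseGroups(robots string) []group {
	var groups []group
	var cur group
	flush := func() {
		if len(cur.agents) > 0 {
			groups = append(groups, cur)
			cur = group{}
		}
	}
	sc := bufio.NewScanner(strings.NewReader(robots))
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		key, value, ok := strings.Cut(line, ":")
		if !ok {
			continue
		}
		key, value = strings.ToLower(strings.TrimSpace(key)), strings.TrimSpace(value)
		switch key {
		case "user-agent":
			// A User-agent line only starts a new group once the current
			// group has accumulated directives; until then it joins it.
			if len(cur.disallows) > 0 {
				flush()
			}
			cur.agents = append(cur.agents, value)
		case "disallow":
			if len(cur.agents) > 0 && value != "" {
				cur.disallows = append(cur.disallows, value)
			}
		}
	}
	flush()
	return groups
}

func main() {
	for _, g := range parseGroups(`
User-agent: BadBot
User-agent: SpamBot
Disallow: /

User-agent: GoodBot
Disallow: /private
`) {
		fmt.Println(g.agents, "->", g.disallows)
	}
	// Output:
	// [BadBot SpamBot] -> [/]
	// [GoodBot] -> [/private]
}
```

Once grouped, the converter can emit a single rule whose expression is an `any:` list of `userAgent.contains(...)` checks, as the `consecutive.yaml` test data shows for the blacklist and crawl-delay cases. The one exception is a path-scoped `Disallow` shared by several agents: as the comments in `convertToAnubisRules` note, an `ExpressionOrList` holds either `all:` or `any:` but not a nested combination, so the tool falls back to emitting one `all:` rule per user agent (`SearchBot1` through `SearchBot3` in the test data).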