fix: handle multiple user agents

Jason Cameron 2025-07-26 20:27:00 -04:00
parent a735770c93
commit 291ed2a084
No known key found for this signature in database
GPG Key ID: 0AA00EA6258337A5
5 changed files with 269 additions and 140 deletions

View File

@@ -10,6 +10,7 @@ import (
     "net/http"
     "os"
     "regexp"
+    "sort"
     "strings"

     "github.com/TecharoHQ/anubis/lib/policy/config"
@@ -133,7 +134,10 @@ func main() {
 func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
     scanner := bufio.NewScanner(input)
     var rules []RobotsRule
-    var currentRule *RobotsRule
+    var currentUserAgents []string
+    var currentDisallows []string
+    var currentAllows []string
+    var currentCrawlDelay int

     for scanner.Scan() {
         line := strings.TrimSpace(scanner.Text())
@@ -154,38 +158,60 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
         switch directive {
         case "user-agent":
-            // Start a new rule section
-            if currentRule != nil {
-                rules = append(rules, *currentRule)
-            }
-            currentRule = &RobotsRule{
-                UserAgent: value,
-                Disallows: make([]string, 0),
-                Allows:    make([]string, 0),
+            // If we have accumulated rules with directives and encounter a new user-agent,
+            // flush the current rules
+            if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
+                for _, userAgent := range currentUserAgents {
+                    rule := RobotsRule{
+                        UserAgent:  userAgent,
+                        Disallows:  make([]string, len(currentDisallows)),
+                        Allows:     make([]string, len(currentAllows)),
+                        CrawlDelay: currentCrawlDelay,
+                    }
+                    copy(rule.Disallows, currentDisallows)
+                    copy(rule.Allows, currentAllows)
+                    rules = append(rules, rule)
+                }
+                // Reset for next group
+                currentUserAgents = nil
+                currentDisallows = nil
+                currentAllows = nil
+                currentCrawlDelay = 0
             }
+            currentUserAgents = append(currentUserAgents, value)
         case "disallow":
-            if currentRule != nil && value != "" {
-                currentRule.Disallows = append(currentRule.Disallows, value)
+            if len(currentUserAgents) > 0 && value != "" {
+                currentDisallows = append(currentDisallows, value)
             }
         case "allow":
-            if currentRule != nil && value != "" {
-                currentRule.Allows = append(currentRule.Allows, value)
+            if len(currentUserAgents) > 0 && value != "" {
+                currentAllows = append(currentAllows, value)
             }
         case "crawl-delay":
-            if currentRule != nil {
+            if len(currentUserAgents) > 0 {
                 if delay, err := parseIntSafe(value); err == nil {
-                    currentRule.CrawlDelay = delay
+                    currentCrawlDelay = delay
                 }
             }
         }
     }

-    // Don't forget the last rule
-    if currentRule != nil {
-        rules = append(rules, *currentRule)
+    // Don't forget the last group of rules
+    if len(currentUserAgents) > 0 {
+        for _, userAgent := range currentUserAgents {
+            rule := RobotsRule{
+                UserAgent:  userAgent,
+                Disallows:  make([]string, len(currentDisallows)),
+                Allows:     make([]string, len(currentAllows)),
+                CrawlDelay: currentCrawlDelay,
+            }
+            copy(rule.Disallows, currentDisallows)
+            copy(rule.Allows, currentAllows)
+            rules = append(rules, rule)
+        }
     }

     // Mark blacklisted user agents (those with "Disallow: /")
@@ -211,39 +237,96 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
     var anubisRules []AnubisRule
     ruleCounter := 0

+    // Group rules by their directives to create any blocks
+    blacklistGroups := make(map[string][]string)  // key: directive signature, value: user agents
+    disallowGroups := make(map[string][]string)   // key: path, value: user agents
+    crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents
+
     for _, robotsRule := range robotsRules {
         userAgent := robotsRule.UserAgent

-        // Handle crawl delay as weight adjustment (do this first before any continues)
+        // Handle crawl delay groups
         if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
-            ruleCounter++
-            rule := AnubisRule{
-                Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
-                Action: "WEIGH",
-                Weight: &config.Weight{Adjust: *crawlDelay},
-            }
-            if userAgent == "*" {
-                rule.Expression = &config.ExpressionOrList{
-                    All: []string{"true"}, // Always applies
-                }
-            } else {
-                rule.Expression = &config.ExpressionOrList{
-                    All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
-                }
-            }
-            anubisRules = append(anubisRules, rule)
+            key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay)
+            crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent)
         }

-        // Handle blacklisted user agents (complete deny/challenge)
+        // Handle blacklisted user agents
         if robotsRule.IsBlacklist {
-            ruleCounter++
-            rule := AnubisRule{
-                Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
-                Action: *userAgentDeny,
-            }
+            key := "blacklist"
+            blacklistGroups[key] = append(blacklistGroups[key], userAgent)
+        }
+
+        // Handle specific disallow rules
+        for _, disallow := range robotsRule.Disallows {
+            if disallow == "/" {
+                continue // Already handled as blacklist above
+            }
+            disallowGroups[disallow] = append(disallowGroups[disallow], userAgent)
+        }
+    }
+
+    // Generate rules for crawl delays
+    // Sort keys for deterministic order
+    var crawlDelayKeys []string
+    for key := range crawlDelayGroups {
+        crawlDelayKeys = append(crawlDelayKeys, key)
+    }
+    sort.Strings(crawlDelayKeys)
+    for _, key := range crawlDelayKeys {
+        userAgents := crawlDelayGroups[key]
+        ruleCounter++
+        rule := AnubisRule{
+            Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
+            Action: "WEIGH",
+            Weight: &config.Weight{Adjust: *crawlDelay},
+        }
+        if len(userAgents) == 1 && userAgents[0] == "*" {
+            rule.Expression = &config.ExpressionOrList{
+                All: []string{"true"}, // Always applies
+            }
+        } else if len(userAgents) == 1 {
+            rule.Expression = &config.ExpressionOrList{
+                All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
+            }
+        } else {
+            // Multiple user agents - use any block
+            var expressions []string
+            for _, ua := range userAgents {
+                if ua == "*" {
+                    expressions = append(expressions, "true")
+                } else {
+                    expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+                }
+            }
+            rule.Expression = &config.ExpressionOrList{
+                Any: expressions,
+            }
+        }
+        anubisRules = append(anubisRules, rule)
+    }
+
+    // Generate rules for blacklisted user agents
+    // Sort keys for deterministic order
+    var blacklistKeys []string
+    for key := range blacklistGroups {
+        blacklistKeys = append(blacklistKeys, key)
+    }
+    sort.Strings(blacklistKeys)
+    for _, key := range blacklistKeys {
+        userAgents := blacklistGroups[key]
+        ruleCounter++
+        rule := AnubisRule{
+            Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
+            Action: *userAgentDeny,
+        }
+        if len(userAgents) == 1 {
+            userAgent := userAgents[0]
             if userAgent == "*" {
                 // This would block everything - convert to a weight adjustment instead
                 rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
@@ -257,41 +340,90 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
                     All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
                 }
             }
-            anubisRules = append(anubisRules, rule)
+        } else {
+            // Multiple user agents - use any block
+            var expressions []string
+            for _, ua := range userAgents {
+                if ua == "*" {
+                    expressions = append(expressions, "true")
+                } else {
+                    expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+                }
+            }
+            rule.Expression = &config.ExpressionOrList{
+                Any: expressions,
+            }
+        }
+        anubisRules = append(anubisRules, rule)
+    }
+
+    // Generate rules for specific disallow paths
+    // Sort keys for deterministic order
+    var disallowKeys []string
+    for key := range disallowGroups {
+        disallowKeys = append(disallowKeys, key)
+    }
+    sort.Strings(disallowKeys)
+    for _, path := range disallowKeys {
+        userAgents := disallowGroups[path]
+        ruleCounter++
+        rule := AnubisRule{
+            Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+            Action: *baseAction,
+        }
+
+        // Build CEL expression
+        var conditions []string
+
+        // Add user agent conditions
+        if len(userAgents) == 1 && userAgents[0] == "*" {
+            // Wildcard user agent - no user agent condition needed
+        } else if len(userAgents) == 1 {
+            conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
+        } else {
+            // Multiple user agents - use any block for user agents
+            var uaExpressions []string
+            for _, ua := range userAgents {
+                if ua == "*" {
+                    uaExpressions = append(uaExpressions, "true")
+                } else {
+                    uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+                }
+            }
+            // For multiple user agents, we need to use a more complex expression
+            // This is a limitation - we can't easily combine any for user agents with all for path
+            // So we'll create separate rules for each user agent
+            for _, ua := range userAgents {
+                if ua == "*" {
+                    continue // Skip wildcard as it's handled separately
+                }
+                ruleCounter++
+                subRule := AnubisRule{
+                    Name:       fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+                    Action:     *baseAction,
+                    Expression: &config.ExpressionOrList{
+                        All: []string{
+                            fmt.Sprintf("userAgent.contains(%q)", ua),
+                            buildPathCondition(path),
+                        },
+                    },
+                }
+                anubisRules = append(anubisRules, subRule)
+            }
             continue
         }

-        // Handle specific disallow rules
-        for _, disallow := range robotsRule.Disallows {
-            if disallow == "/" {
-                continue // Already handled as blacklist above
-            }
-            ruleCounter++
-            rule := AnubisRule{
-                Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
-                Action: *baseAction,
-            }
-            // Build CEL expression
-            var conditions []string
-            // Add user agent condition if not wildcard
-            if userAgent != "*" {
-                conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
-            }
-            // Add path condition
-            pathCondition := buildPathCondition(disallow)
-            conditions = append(conditions, pathCondition)
-            rule.Expression = &config.ExpressionOrList{
-                All: conditions,
-            }
-            anubisRules = append(anubisRules, rule)
+        // Add path condition
+        pathCondition := buildPathCondition(path)
+        conditions = append(conditions, pathCondition)
+
+        rule.Expression = &config.ExpressionOrList{
+            All: conditions,
         }
+        anubisRules = append(anubisRules, rule)
     }

     return anubisRules
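
A rough, self-contained sketch of the grouping idea behind this change, for orientation: consecutive User-agent lines accumulate until directives have been seen and a new User-agent appears, at which point the whole group is flushed and every agent in it shares the same directives (which is what later lets the converter emit one rule with an "any" expression). The program below is illustrative only, it tracks just Disallow, and it is not part of the patch or its test data.

package main

import (
	"bufio"
	"fmt"
	"strings"
)

func main() {
	input := `User-agent: BadBot
User-agent: SpamBot
Disallow: /

User-agent: Googlebot
Disallow: /search`

	var agents, disallows []string

	// flush emits the current group: every accumulated user agent shares the
	// directives that followed it.
	flush := func() {
		if len(agents) == 0 {
			return
		}
		fmt.Printf("group %v disallows %v\n", agents, disallows)
		agents, disallows = nil, nil
	}

	scanner := bufio.NewScanner(strings.NewReader(input))
	for scanner.Scan() {
		key, value, ok := strings.Cut(strings.TrimSpace(scanner.Text()), ":")
		if !ok {
			continue
		}
		value = strings.TrimSpace(value)
		switch strings.ToLower(key) {
		case "user-agent":
			// A User-agent line that follows directives starts a new group.
			if len(disallows) > 0 {
				flush()
			}
			agents = append(agents, value)
		case "disallow":
			disallows = append(disallows, value)
		}
	}
	flush()
	// Prints:
	//   group [BadBot SpamBot] disallows [/]
	//   group [Googlebot] disallows [/search]
}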

View File

@@ -3,28 +3,28 @@
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 3
-- action: CHALLENGE
-  expression: path.startsWith("/admin")
-  name: robots-txt-policy-disallow-2
-- action: DENY
-  expression: userAgent.contains("BadBot")
-  name: robots-txt-policy-blacklist-3
-- action: WEIGH
-  expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-crawl-delay-4
-  weight:
-    adjust: 3
-- action: DENY
-  expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-blacklist-5
 - action: WEIGH
   expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-6
+  name: robots-txt-policy-crawl-delay-2
   weight:
     adjust: 3
+- action: WEIGH
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-crawl-delay-3
+  weight:
+    adjust: 3
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SpamBot")
+  name: robots-txt-policy-blacklist-4
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-5
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Googlebot")
       - path.startsWith("/search")
-  name: robots-txt-policy-disallow-7
+  name: robots-txt-policy-disallow-6

View File

@@ -1,71 +1,68 @@
 - action: WEIGH
-  expression: "true"
+  expression: userAgent.contains("Bingbot")
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 5
-- action: CHALLENGE
-  expression: path.startsWith("/admin/")
-  name: robots-txt-policy-disallow-2
-- action: CHALLENGE
-  expression: path.startsWith("/private/")
-  name: robots-txt-policy-disallow-3
-- action: CHALLENGE
-  expression: path.startsWith("/api/internal/")
-  name: robots-txt-policy-disallow-4
 - action: WEIGH
   expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-5
+  name: robots-txt-policy-crawl-delay-2
   weight:
     adjust: 5
-- action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("Googlebot")
-      - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-6
 - action: WEIGH
-  expression: userAgent.contains("Bingbot")
-  name: robots-txt-policy-crawl-delay-7
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-crawl-delay-3
   weight:
     adjust: 5
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-4
+  weight:
+    adjust: 5
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SeoBot")
+  name: robots-txt-policy-blacklist-5
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("Bingbot")
-      - path.startsWith("/search/")
+      - userAgent.contains("TestBot")
+      - path.matches("^/.*/admin")
+  name: robots-txt-policy-disallow-6
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/admin/")
   name: robots-txt-policy-disallow-8
 - action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("Bingbot")
-      - path.startsWith("/admin/")
+  expression: path.startsWith("/api/internal/")
   name: robots-txt-policy-disallow-9
-- action: DENY
-  expression: userAgent.contains("BadBot")
-  name: robots-txt-policy-blacklist-10
-- action: WEIGH
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-crawl-delay-11
-  weight:
-    adjust: 5
-- action: DENY
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-blacklist-12
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("TestBot")
-      - path.matches("^/.*/admin")
+      - path.matches("^/file.\\.log")
+  name: robots-txt-policy-disallow-10
+- action: CHALLENGE
+  expression: path.startsWith("/private/")
+  name: robots-txt-policy-disallow-11
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("Googlebot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/temp.*\\.html")
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/search/")
   name: robots-txt-policy-disallow-14
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("TestBot")
-      - path.matches("^/file.\\.log")
+      - path.matches("^/temp.*\\.html")
   name: robots-txt-policy-disallow-15

View File

@@ -1,12 +1,12 @@
 [
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/admin/\")",
-    "name": "robots-txt-policy-disallow-1"
+    "name": "robots-txt-policy-disallow-1",
+    "action": "CHALLENGE"
   },
   {
-    "action": "CHALLENGE",
     "expression": "path.startsWith(\"/private\")",
-    "name": "robots-txt-policy-disallow-2"
+    "name": "robots-txt-policy-disallow-2",
+    "action": "CHALLENGE"
   }
 ]

View File

@@ -1,12 +1,12 @@
-- action: CHALLENGE
-  expression: path.matches("^/search.*")
-  name: robots-txt-policy-disallow-1
 - action: CHALLENGE
   expression: path.matches("^/.*/private")
+  name: robots-txt-policy-disallow-1
+- action: CHALLENGE
+  expression: path.matches("^/admin/.*.action=delete")
   name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.matches("^/file.\\.txt")
   name: robots-txt-policy-disallow-3
 - action: CHALLENGE
-  expression: path.matches("^/admin/.*.action=delete")
+  expression: path.matches("^/search.*")
   name: robots-txt-policy-disallow-4