Mirror of https://github.com/TecharoHQ/anubis.git, synced 2025-08-03 01:38:14 -04:00
fix: handle multiple user agents
commit 291ed2a084
parent a735770c93
@@ -10,6 +10,7 @@ import (
"net/http"
"os"
"regexp"
"sort"
"strings"

"github.com/TecharoHQ/anubis/lib/policy/config"
@@ -133,7 +134,10 @@ func main() {
func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
scanner := bufio.NewScanner(input)
var rules []RobotsRule
var currentRule *RobotsRule
var currentUserAgents []string
var currentDisallows []string
var currentAllows []string
var currentCrawlDelay int

for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
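The single in-progress *RobotsRule is replaced here by four accumulator variables so that one directive block can fan out to every User-agent line that precedes it. As a sketch only (type and method names are mine; RobotsRule and its fields come from this file), the same state folded into one struct would look like this:

// Hypothetical restatement of the new parser state; not part of the diff.
type pendingGroup struct {
	userAgents []string // every User-agent line seen since the last flush
	disallows  []string
	allows     []string
	crawlDelay int
}

// flush emits one RobotsRule per accumulated user agent, which is what the
// new parser code below does inline whenever a group ends.
func (g *pendingGroup) flush(rules []RobotsRule) []RobotsRule {
	for _, ua := range g.userAgents {
		rules = append(rules, RobotsRule{
			UserAgent:  ua,
			Disallows:  append([]string(nil), g.disallows...),
			Allows:     append([]string(nil), g.allows...),
			CrawlDelay: g.crawlDelay,
		})
	}
	*g = pendingGroup{}
	return rules
}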
@@ -154,38 +158,60 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {

switch directive {
case "user-agent":
// Start a new rule section
if currentRule != nil {
rules = append(rules, *currentRule)
}
currentRule = &RobotsRule{
UserAgent: value,
Disallows: make([]string, 0),
Allows: make([]string, 0),
// If we have accumulated rules with directives and encounter a new user-agent,
// flush the current rules
if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
for _, userAgent := range currentUserAgents {
rule := RobotsRule{
UserAgent: userAgent,
Disallows: make([]string, len(currentDisallows)),
Allows: make([]string, len(currentAllows)),
CrawlDelay: currentCrawlDelay,
}
copy(rule.Disallows, currentDisallows)
copy(rule.Allows, currentAllows)
rules = append(rules, rule)
}
// Reset for next group
currentUserAgents = nil
currentDisallows = nil
currentAllows = nil
currentCrawlDelay = 0
}
currentUserAgents = append(currentUserAgents, value)

case "disallow":
if currentRule != nil && value != "" {
currentRule.Disallows = append(currentRule.Disallows, value)
if len(currentUserAgents) > 0 && value != "" {
currentDisallows = append(currentDisallows, value)
}

case "allow":
if currentRule != nil && value != "" {
currentRule.Allows = append(currentRule.Allows, value)
if len(currentUserAgents) > 0 && value != "" {
currentAllows = append(currentAllows, value)
}

case "crawl-delay":
if currentRule != nil {
if len(currentUserAgents) > 0 {
if delay, err := parseIntSafe(value); err == nil {
currentRule.CrawlDelay = delay
currentCrawlDelay = delay
}
}
}
}

// Don't forget the last rule
if currentRule != nil {
rules = append(rules, *currentRule)
// Don't forget the last group of rules
if len(currentUserAgents) > 0 {
for _, userAgent := range currentUserAgents {
rule := RobotsRule{
UserAgent: userAgent,
Disallows: make([]string, len(currentDisallows)),
Allows: make([]string, len(currentAllows)),
CrawlDelay: currentCrawlDelay,
}
copy(rule.Disallows, currentDisallows)
copy(rule.Allows, currentAllows)
rules = append(rules, rule)
}
}

// Mark blacklisted user agents (those with "Disallow: /")
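The behavioural change in this hunk: the old code started a new rule at every User-agent line, so in a group with several consecutive User-agent lines the earlier agents were flushed with no directives and only the last one picked up the Disallow, Allow, and Crawl-delay lines. The new code accumulates agents and only flushes once directives have been seen and the next group begins. A small standalone sketch of that accumulate-then-flush idea (separate from the robots2policy code, with a made-up input):

package main

import (
	"bufio"
	"fmt"
	"strings"
)

type group struct {
	agents    []string
	disallows []string
}

func main() {
	const robots = `User-agent: BadBot
User-agent: SpamBot
Disallow: /
User-agent: Googlebot
Disallow: /search
`
	var groups []group
	var cur group
	sc := bufio.NewScanner(strings.NewReader(robots))
	for sc.Scan() {
		field, value, ok := strings.Cut(sc.Text(), ":")
		if !ok {
			continue
		}
		value = strings.TrimSpace(value)
		switch strings.ToLower(strings.TrimSpace(field)) {
		case "user-agent":
			// A User-agent line that follows directives closes the previous
			// group; consecutive User-agent lines accumulate into one group.
			if len(cur.agents) > 0 && len(cur.disallows) > 0 {
				groups = append(groups, cur)
				cur = group{}
			}
			cur.agents = append(cur.agents, value)
		case "disallow":
			cur.disallows = append(cur.disallows, value)
		}
	}
	if len(cur.agents) > 0 {
		groups = append(groups, cur)
	}
	// Two groups: {[BadBot SpamBot] [/]} and {[Googlebot] [/search]}.
	fmt.Println(groups)
}

With the pre-fix behaviour, BadBot would have ended up with no Disallow at all.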
@@ -211,39 +237,96 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
var anubisRules []AnubisRule
ruleCounter := 0

// Group rules by their directives to create any blocks
blacklistGroups := make(map[string][]string) // key: directive signature, value: user agents
disallowGroups := make(map[string][]string) // key: path, value: user agents
crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents

for _, robotsRule := range robotsRules {
userAgent := robotsRule.UserAgent

// Handle crawl delay as weight adjustment (do this first before any continues)
// Handle crawl delay groups
if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
ruleCounter++
rule := AnubisRule{
Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
Action: "WEIGH",
Weight: &config.Weight{Adjust: *crawlDelay},
}

if userAgent == "*" {
rule.Expression = &config.ExpressionOrList{
All: []string{"true"}, // Always applies
}
} else {
rule.Expression = &config.ExpressionOrList{
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
}
}

anubisRules = append(anubisRules, rule)
key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay)
crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent)
}

// Handle blacklisted user agents (complete deny/challenge)
// Handle blacklisted user agents
if robotsRule.IsBlacklist {
ruleCounter++
rule := AnubisRule{
Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
Action: *userAgentDeny,
}
key := "blacklist"
blacklistGroups[key] = append(blacklistGroups[key], userAgent)
}

// Handle specific disallow rules
for _, disallow := range robotsRule.Disallows {
if disallow == "/" {
continue // Already handled as blacklist above
}
disallowGroups[disallow] = append(disallowGroups[disallow], userAgent)
}
}

// Generate rules for crawl delays
// Sort keys for deterministic order
var crawlDelayKeys []string
for key := range crawlDelayGroups {
crawlDelayKeys = append(crawlDelayKeys, key)
}
sort.Strings(crawlDelayKeys)

for _, key := range crawlDelayKeys {
userAgents := crawlDelayGroups[key]
ruleCounter++
rule := AnubisRule{
Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
Action: "WEIGH",
Weight: &config.Weight{Adjust: *crawlDelay},
}

if len(userAgents) == 1 && userAgents[0] == "*" {
rule.Expression = &config.ExpressionOrList{
All: []string{"true"}, // Always applies
}
} else if len(userAgents) == 1 {
rule.Expression = &config.ExpressionOrList{
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
}
} else {
// Multiple user agents - use any block
var expressions []string
for _, ua := range userAgents {
if ua == "*" {
expressions = append(expressions, "true")
} else {
expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
}
}
rule.Expression = &config.ExpressionOrList{
Any: expressions,
}
}

anubisRules = append(anubisRules, rule)
}

// Generate rules for blacklisted user agents
// Sort keys for deterministic order
var blacklistKeys []string
for key := range blacklistGroups {
blacklistKeys = append(blacklistKeys, key)
}
sort.Strings(blacklistKeys)

for _, key := range blacklistKeys {
userAgents := blacklistGroups[key]
ruleCounter++
rule := AnubisRule{
Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
Action: *userAgentDeny,
}

if len(userAgents) == 1 {
userAgent := userAgents[0]
if userAgent == "*" {
// This would block everything - convert to a weight adjustment instead
rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
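The crawl-delay, blacklist, and disallow branches above all build their user-agent match the same way: a single agent becomes one userAgent.contains(...) entry in an all list, the wildcard * becomes "true", and several agents become an any list. A sketch of that shared pattern pulled into a helper (the helper itself is not in the diff; config.ExpressionOrList and the CEL expressions are):

// userAgentExpression is a hypothetical consolidation of the repeated
// expression-building logic in convertToAnubisRules.
func userAgentExpression(userAgents []string) *config.ExpressionOrList {
	if len(userAgents) == 1 {
		if userAgents[0] == "*" {
			return &config.ExpressionOrList{All: []string{"true"}} // always applies
		}
		return &config.ExpressionOrList{All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}}
	}
	exprs := make([]string, 0, len(userAgents))
	for _, ua := range userAgents {
		if ua == "*" {
			exprs = append(exprs, "true")
			continue
		}
		exprs = append(exprs, fmt.Sprintf("userAgent.contains(%q)", ua))
	}
	return &config.ExpressionOrList{Any: exprs}
}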
@@ -257,41 +340,90 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
}
}
anubisRules = append(anubisRules, rule)
} else {
// Multiple user agents - use any block
var expressions []string
for _, ua := range userAgents {
if ua == "*" {
expressions = append(expressions, "true")
} else {
expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
}
}
rule.Expression = &config.ExpressionOrList{
Any: expressions,
}
}

anubisRules = append(anubisRules, rule)
}

// Generate rules for specific disallow paths
// Sort keys for deterministic order
var disallowKeys []string
for key := range disallowGroups {
disallowKeys = append(disallowKeys, key)
}
sort.Strings(disallowKeys)

for _, path := range disallowKeys {
userAgents := disallowGroups[path]
ruleCounter++
rule := AnubisRule{
Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
Action: *baseAction,
}

// Build CEL expression
var conditions []string

// Add user agent conditions
if len(userAgents) == 1 && userAgents[0] == "*" {
// Wildcard user agent - no user agent condition needed
} else if len(userAgents) == 1 {
conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
} else {
// Multiple user agents - use any block for user agents
var uaExpressions []string
for _, ua := range userAgents {
if ua == "*" {
uaExpressions = append(uaExpressions, "true")
} else {
uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua))
}
}
// For multiple user agents, we need to use a more complex expression
// This is a limitation - we can't easily combine any for user agents with all for path
// So we'll create separate rules for each user agent
for _, ua := range userAgents {
if ua == "*" {
continue // Skip wildcard as it's handled separately
}
ruleCounter++
subRule := AnubisRule{
Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
Action: *baseAction,
Expression: &config.ExpressionOrList{
All: []string{
fmt.Sprintf("userAgent.contains(%q)", ua),
buildPathCondition(path),
},
},
}
anubisRules = append(anubisRules, subRule)
}
continue
}

// Handle specific disallow rules
for _, disallow := range robotsRule.Disallows {
if disallow == "/" {
continue // Already handled as blacklist above
}
// Add path condition
pathCondition := buildPathCondition(path)
conditions = append(conditions, pathCondition)

ruleCounter++
rule := AnubisRule{
Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
Action: *baseAction,
}

// Build CEL expression
var conditions []string

// Add user agent condition if not wildcard
if userAgent != "*" {
conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
}

// Add path condition
pathCondition := buildPathCondition(disallow)
conditions = append(conditions, pathCondition)

rule.Expression = &config.ExpressionOrList{
All: conditions,
}

anubisRules = append(anubisRules, rule)
rule.Expression = &config.ExpressionOrList{
All: conditions,
}

anubisRules = append(anubisRules, rule)
}

return anubisRules
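For a Disallow path shared by several user agents, the comments in this hunk note that an any-of agent list cannot be nested inside the all-of path check, so the converter falls back to one sub-rule per agent. A simplified sketch of that fallback (helper name and per-call numbering are mine; AnubisRule, config.ExpressionOrList, and buildPathCondition are the file's own, and the real code threads a shared ruleCounter through the rule names):

func perAgentDisallowRules(policy, action, path string, userAgents []string) []AnubisRule {
	var out []AnubisRule
	for i, ua := range userAgents {
		if ua == "*" {
			continue // mirrors the diff: the wildcard agent is handled separately
		}
		out = append(out, AnubisRule{
			Name:   fmt.Sprintf("%s-disallow-%d", policy, i+1),
			Action: action,
			Expression: &config.ExpressionOrList{
				All: []string{
					fmt.Sprintf("userAgent.contains(%q)", ua),
					buildPathCondition(path),
				},
			},
		})
	}
	return out
}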
cmd/robots2policy/testdata/blacklist.yaml (vendored) | 36
@@ -3,28 +3,28 @@
name: robots-txt-policy-crawl-delay-1
weight:
adjust: 3
- action: CHALLENGE
expression: path.startsWith("/admin")
name: robots-txt-policy-disallow-2
- action: DENY
expression: userAgent.contains("BadBot")
name: robots-txt-policy-blacklist-3
- action: WEIGH
expression: userAgent.contains("SpamBot")
name: robots-txt-policy-crawl-delay-4
weight:
adjust: 3
- action: DENY
expression: userAgent.contains("SpamBot")
name: robots-txt-policy-blacklist-5
- action: WEIGH
expression: userAgent.contains("Googlebot")
name: robots-txt-policy-crawl-delay-6
name: robots-txt-policy-crawl-delay-2
weight:
adjust: 3
- action: WEIGH
expression: userAgent.contains("SpamBot")
name: robots-txt-policy-crawl-delay-3
weight:
adjust: 3
- action: DENY
expression:
any:
- userAgent.contains("BadBot")
- userAgent.contains("SpamBot")
name: robots-txt-policy-blacklist-4
- action: CHALLENGE
expression: path.startsWith("/admin")
name: robots-txt-policy-disallow-5
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search")
name: robots-txt-policy-disallow-7
- userAgent.contains("Googlebot")
- path.startsWith("/search")
name: robots-txt-policy-disallow-6
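For reference, the grouped DENY rule above uses the any: form of an expression list. A standalone sketch of how that shape serialises (types and YAML tags are assumed for illustration and use gopkg.in/yaml.v3, not the project's own config marshalling):

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Illustrative stand-ins for the real config types.
type exprOrList struct {
	Any []string `yaml:"any,omitempty"`
	All []string `yaml:"all,omitempty"`
}

type rule struct {
	Action     string     `yaml:"action"`
	Expression exprOrList `yaml:"expression"`
	Name       string     `yaml:"name"`
}

func main() {
	r := rule{
		Action: "DENY",
		Expression: exprOrList{Any: []string{
			`userAgent.contains("BadBot")`,
			`userAgent.contains("SpamBot")`,
		}},
		Name: "robots-txt-policy-blacklist-4",
	}
	out, err := yaml.Marshal(r)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out)) // action, expression.any, name: the same shape as the fixture
}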
cmd/robots2policy/testdata/complex.yaml (vendored) | 81
@@ -1,71 +1,68 @@
- action: WEIGH
expression: "true"
expression: userAgent.contains("Bingbot")
name: robots-txt-policy-crawl-delay-1
weight:
adjust: 5
- action: CHALLENGE
expression: path.startsWith("/admin/")
name: robots-txt-policy-disallow-2
- action: CHALLENGE
expression: path.startsWith("/private/")
name: robots-txt-policy-disallow-3
- action: CHALLENGE
expression: path.startsWith("/api/internal/")
name: robots-txt-policy-disallow-4
- action: WEIGH
expression: userAgent.contains("Googlebot")
name: robots-txt-policy-crawl-delay-5
name: robots-txt-policy-crawl-delay-2
weight:
adjust: 5
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-6
- action: WEIGH
expression: userAgent.contains("Bingbot")
name: robots-txt-policy-crawl-delay-7
expression: userAgent.contains("SeoBot")
name: robots-txt-policy-crawl-delay-3
weight:
adjust: 5
- action: WEIGH
expression: "true"
name: robots-txt-policy-crawl-delay-4
weight:
adjust: 5
- action: DENY
expression:
any:
- userAgent.contains("BadBot")
- userAgent.contains("SeoBot")
name: robots-txt-policy-blacklist-5
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/search/")
- userAgent.contains("TestBot")
- path.matches("^/.*/admin")
name: robots-txt-policy-disallow-6
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/admin/")
name: robots-txt-policy-disallow-8
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/admin/")
expression: path.startsWith("/api/internal/")
name: robots-txt-policy-disallow-9
- action: DENY
expression: userAgent.contains("BadBot")
name: robots-txt-policy-blacklist-10
- action: WEIGH
expression: userAgent.contains("SeoBot")
name: robots-txt-policy-crawl-delay-11
weight:
adjust: 5
- action: DENY
expression: userAgent.contains("SeoBot")
name: robots-txt-policy-blacklist-12
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/.*/admin")
- userAgent.contains("TestBot")
- path.matches("^/file.\\.log")
name: robots-txt-policy-disallow-10
- action: CHALLENGE
expression: path.startsWith("/private/")
name: robots-txt-policy-disallow-11
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-13
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/temp.*\\.html")
- userAgent.contains("Bingbot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-14
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/file.\\.log")
- userAgent.contains("TestBot")
- path.matches("^/temp.*\\.html")
name: robots-txt-policy-disallow-15
cmd/robots2policy/testdata/simple.json (vendored) | 8
@@ -1,12 +1,12 @@
[
{
"action": "CHALLENGE",
"expression": "path.startsWith(\"/admin/\")",
"name": "robots-txt-policy-disallow-1"
"name": "robots-txt-policy-disallow-1",
"action": "CHALLENGE"
},
{
"action": "CHALLENGE",
"expression": "path.startsWith(\"/private\")",
"name": "robots-txt-policy-disallow-2"
"name": "robots-txt-policy-disallow-2",
"action": "CHALLENGE"
}
]
cmd/robots2policy/testdata/wildcards.yaml (vendored) | 8
@@ -1,12 +1,12 @@
- action: CHALLENGE
expression: path.matches("^/search.*")
name: robots-txt-policy-disallow-1
- action: CHALLENGE
expression: path.matches("^/.*/private")
name: robots-txt-policy-disallow-1
- action: CHALLENGE
expression: path.matches("^/admin/.*.action=delete")
name: robots-txt-policy-disallow-2
- action: CHALLENGE
expression: path.matches("^/file.\\.txt")
name: robots-txt-policy-disallow-3
- action: CHALLENGE
expression: path.matches("^/admin/.*.action=delete")
expression: path.matches("^/search.*")
name: robots-txt-policy-disallow-4
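These expected outputs show how robots.txt paths end up as CEL conditions: plain prefixes become path.startsWith checks, while paths containing wildcards are rewritten into an anchored regular expression for path.matches. A reconstruction of that mapping from the fixtures (this is not the repository's buildPathCondition, just a sketch consistent with the outputs above; it assumes the file's fmt, regexp, and strings imports):

// pathCondition converts a robots.txt path into a CEL condition string.
// '*' becomes '.*', '$' stays an end anchor, '?' shows up in the fixtures as a
// single-character match, and everything else is escaped for the regex.
func pathCondition(robotsPath string) string {
	if !strings.ContainsAny(robotsPath, "*?$") {
		return fmt.Sprintf("path.startsWith(%q)", robotsPath)
	}
	var re strings.Builder
	re.WriteString("^")
	for _, r := range robotsPath {
		switch r {
		case '*':
			re.WriteString(".*")
		case '?':
			re.WriteString(".")
		case '$':
			re.WriteString("$")
		default:
			re.WriteString(regexp.QuoteMeta(string(r)))
		}
	}
	return fmt.Sprintf("path.matches(%q)", re.String())
}

Under these assumptions, pathCondition("/search*") yields path.matches("^/search.*") as in disallow-1 above, while a plain "/private" would stay a path.startsWith check.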