fix: collate

Jason Cameron 2025-07-26 20:34:04 -04:00
parent 291ed2a084
commit e51b4bd965
7 changed files with 280 additions and 248 deletions

View File

@@ -10,7 +10,6 @@ import (
 "net/http"
 "os"
 "regexp"
-"sort"
 "strings"
 "github.com/TecharoHQ/anubis/lib/policy/config"
@@ -30,7 +29,7 @@ var (
 )
 type RobotsRule struct {
-UserAgent string
+UserAgents []string
 Disallows []string
 Allows []string
 CrawlDelay int
@@ -161,17 +160,16 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 // If we have accumulated rules with directives and encounter a new user-agent,
 // flush the current rules
 if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
-for _, userAgent := range currentUserAgents {
 rule := RobotsRule{
-UserAgent: userAgent,
+UserAgents: make([]string, len(currentUserAgents)),
 Disallows: make([]string, len(currentDisallows)),
 Allows: make([]string, len(currentAllows)),
 CrawlDelay: currentCrawlDelay,
 }
+copy(rule.UserAgents, currentUserAgents)
 copy(rule.Disallows, currentDisallows)
 copy(rule.Allows, currentAllows)
 rules = append(rules, rule)
-}
 // Reset for next group
 currentUserAgents = nil
 currentDisallows = nil
@@ -201,18 +199,17 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
 // Don't forget the last group of rules
 if len(currentUserAgents) > 0 {
-for _, userAgent := range currentUserAgents {
 rule := RobotsRule{
-UserAgent: userAgent,
+UserAgents: make([]string, len(currentUserAgents)),
 Disallows: make([]string, len(currentDisallows)),
 Allows: make([]string, len(currentAllows)),
 CrawlDelay: currentCrawlDelay,
 }
+copy(rule.UserAgents, currentUserAgents)
 copy(rule.Disallows, currentDisallows)
 copy(rule.Allows, currentAllows)
 rules = append(rules, rule)
 }
-}
 // Mark blacklisted user agents (those with "Disallow: /")
 for i := range rules {
@@ -237,45 +234,12 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 var anubisRules []AnubisRule
 ruleCounter := 0
-// Group rules by their directives to create any blocks
-blacklistGroups := make(map[string][]string) // key: directive signature, value: user agents
-disallowGroups := make(map[string][]string) // key: path, value: user agents
-crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents
+// Process each robots rule individually
 for _, robotsRule := range robotsRules {
-userAgent := robotsRule.UserAgent
+userAgents := robotsRule.UserAgents
-// Handle crawl delay groups
+// Handle crawl delay
 if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
-key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay)
-crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent)
-}
-// Handle blacklisted user agents
-if robotsRule.IsBlacklist {
-key := "blacklist"
-blacklistGroups[key] = append(blacklistGroups[key], userAgent)
-}
-// Handle specific disallow rules
-for _, disallow := range robotsRule.Disallows {
-if disallow == "/" {
-continue // Already handled as blacklist above
-}
-disallowGroups[disallow] = append(disallowGroups[disallow], userAgent)
-}
-}
-// Generate rules for crawl delays
-// Sort keys for deterministic order
-var crawlDelayKeys []string
-for key := range crawlDelayGroups {
-crawlDelayKeys = append(crawlDelayKeys, key)
-}
-sort.Strings(crawlDelayKeys)
-for _, key := range crawlDelayKeys {
-userAgents := crawlDelayGroups[key]
 ruleCounter++
 rule := AnubisRule{
 Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
@@ -305,20 +269,11 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 Any: expressions,
 }
 }
 anubisRules = append(anubisRules, rule)
 }
-// Generate rules for blacklisted user agents
-// Sort keys for deterministic order
-var blacklistKeys []string
-for key := range blacklistGroups {
-blacklistKeys = append(blacklistKeys, key)
-}
-sort.Strings(blacklistKeys)
-for _, key := range blacklistKeys {
-userAgents := blacklistGroups[key]
+// Handle blacklisted user agents
+if robotsRule.IsBlacklist {
 ruleCounter++
 rule := AnubisRule{
 Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
@@ -354,20 +309,15 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 Any: expressions,
 }
 }
 anubisRules = append(anubisRules, rule)
 }
-// Generate rules for specific disallow paths
-// Sort keys for deterministic order
-var disallowKeys []string
-for key := range disallowGroups {
-disallowKeys = append(disallowKeys, key)
+// Handle specific disallow rules
+for _, disallow := range robotsRule.Disallows {
+if disallow == "/" {
+continue // Already handled as blacklist above
 }
-sort.Strings(disallowKeys)
-for _, path := range disallowKeys {
-userAgents := disallowGroups[path]
 ruleCounter++
 rule := AnubisRule{
 Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
@@ -406,7 +356,7 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 Expression: &config.ExpressionOrList{
 All: []string{
 fmt.Sprintf("userAgent.contains(%q)", ua),
-buildPathCondition(path),
+buildPathCondition(disallow),
 },
 },
 }
@@ -416,7 +366,7 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 }
 // Add path condition
-pathCondition := buildPathCondition(path)
+pathCondition := buildPathCondition(disallow)
 conditions = append(conditions, pathCondition)
 rule.Expression = &config.ExpressionOrList{
@@ -425,6 +375,7 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
 anubisRules = append(anubisRules, rule)
 }
+}
 return anubisRules
 }
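For orientation, the collation behavior this change introduces can be sketched in isolation: a robots group that carried several consecutive User-agent lines can now be emitted as a single Anubis rule whose user-agent check is an any-of list (as the blacklist and crawl-delay entries in consecutive.yaml further down show), while a group with a single user agent keeps a plain expression. The snippet below is only a stand-alone illustration of that idea; exprOrList and buildUAExpression are invented for the example and are not the converter's actual types or helpers.

package main

import "fmt"

// exprOrList is an illustrative stand-in for an expression that is either a
// single CEL string or a list of alternatives joined with "any" semantics.
type exprOrList struct {
    Expr string   // set when there is exactly one condition
    Any  []string // set when any one of several conditions may match
}

// buildUAExpression mirrors the collation idea: consecutive User-agent lines
// that share the same directives collapse into one robots rule, so the
// matching Anubis rule has to accept any of those user agents.
func buildUAExpression(userAgents []string) exprOrList {
    if len(userAgents) == 1 {
        return exprOrList{Expr: fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}
    }
    expressions := make([]string, 0, len(userAgents))
    for _, ua := range userAgents {
        expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
    }
    return exprOrList{Any: expressions}
}

func main() {
    fmt.Printf("%+v\n", buildUAExpression([]string{"GoodBot"}))
    fmt.Printf("%+v\n", buildUAExpression([]string{"BadBot", "SpamBot", "EvilBot"}))
}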

View File

@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
 expectedFile: "complex.yaml",
 options: TestOptions{format: "yaml", crawlDelayWeight: 5},
 },
+{
+name: "consecutive_user_agents",
+robotsFile: "consecutive.robots.txt",
+expectedFile: "consecutive.yaml",
+options: TestOptions{format: "yaml", crawlDelayWeight: 3},
+},
 }
 for _, tc := range testCases {

View File

@@ -3,28 +3,28 @@
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 3
-- action: WEIGH
-  expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-2
-  weight:
-    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-3
 - action: WEIGH
   expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-crawl-delay-3
+  name: robots-txt-policy-crawl-delay-4
   weight:
     adjust: 3
 - action: DENY
-  expression:
-    any:
-      - userAgent.contains("BadBot")
-      - userAgent.contains("SpamBot")
-  name: robots-txt-policy-blacklist-4
-- action: CHALLENGE
-  expression: path.startsWith("/admin")
-  name: robots-txt-policy-disallow-5
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-blacklist-5
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-6
+  weight:
+    adjust: 3
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Googlebot")
       - path.startsWith("/search")
-  name: robots-txt-policy-disallow-6
+  name: robots-txt-policy-disallow-7

View File

@@ -1,68 +1,71 @@
 - action: WEIGH
-  expression: userAgent.contains("Bingbot")
+  expression: "true"
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 5
-- action: WEIGH
-  expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-2
-  weight:
-    adjust: 5
-- action: WEIGH
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-crawl-delay-3
-  weight:
-    adjust: 5
-- action: WEIGH
-  expression: "true"
-  name: robots-txt-policy-crawl-delay-4
-  weight:
-    adjust: 5
-- action: DENY
-  expression:
-    any:
-      - userAgent.contains("BadBot")
-      - userAgent.contains("SeoBot")
-  name: robots-txt-policy-blacklist-5
 - action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/.*/admin")
-  name: robots-txt-policy-disallow-6
-- action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("Bingbot")
-      - path.startsWith("/admin/")
-  name: robots-txt-policy-disallow-8
-- action: CHALLENGE
-  expression: path.startsWith("/api/internal/")
-  name: robots-txt-policy-disallow-9
-- action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/file.\\.log")
-  name: robots-txt-policy-disallow-10
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.startsWith("/private/")
-  name: robots-txt-policy-disallow-11
+  name: robots-txt-policy-disallow-3
+- action: CHALLENGE
+  expression: path.startsWith("/api/internal/")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 5
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Googlebot")
       - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-13
+  name: robots-txt-policy-disallow-6
+- action: WEIGH
+  expression: userAgent.contains("Bingbot")
+  name: robots-txt-policy-crawl-delay-7
+  weight:
+    adjust: 5
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Bingbot")
       - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-14
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-9
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-10
+- action: WEIGH
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-crawl-delay-11
+  weight:
+    adjust: 5
+- action: DENY
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-blacklist-12
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("TestBot")
+      - path.matches("^/.*/admin")
+  name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("TestBot")
       - path.matches("^/temp.*\\.html")
+  name: robots-txt-policy-disallow-14
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("TestBot")
+      - path.matches("^/file.\\.log")
   name: robots-txt-policy-disallow-15

View File

@@ -0,0 +1,25 @@
+# Test consecutive user agents that should be grouped into any: blocks
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+# Multiple consecutive user agents - should be grouped
+User-agent: BadBot
+User-agent: SpamBot
+User-agent: EvilBot
+Disallow: /
+
+# Single user agent - should be separate
+User-agent: GoodBot
+Disallow: /private
+
+# Multiple consecutive user agents with crawl delay
+User-agent: SlowBot1
+User-agent: SlowBot2
+Crawl-delay: 5
+
+# Multiple consecutive user agents with specific path
+User-agent: SearchBot1
+User-agent: SearchBot2
+User-agent: SearchBot3
+Disallow: /search
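As a reading aid, the grouping this fixture is meant to exercise looks roughly like the parsed rules below. The RobotsRule fields are the ones visible in the converter diff above; the literal itself is illustrative and is not taken from the test harness.

package main

import "fmt"

// RobotsRule mirrors the struct shown in the converter diff above.
type RobotsRule struct {
    UserAgents  []string
    Disallows   []string
    Allows      []string
    CrawlDelay  int
    IsBlacklist bool
}

func main() {
    // Roughly the groups consecutive.robots.txt should parse into: each run of
    // consecutive User-agent lines shares a single rule, and "Disallow: /"
    // marks the group as blacklisted.
    expected := []RobotsRule{
        {UserAgents: []string{"*"}, Disallows: []string{"/admin"}, CrawlDelay: 10},
        {UserAgents: []string{"BadBot", "SpamBot", "EvilBot"}, Disallows: []string{"/"}, IsBlacklist: true},
        {UserAgents: []string{"GoodBot"}, Disallows: []string{"/private"}},
        {UserAgents: []string{"SlowBot1", "SlowBot2"}, CrawlDelay: 5},
        {UserAgents: []string{"SearchBot1", "SearchBot2", "SearchBot3"}, Disallows: []string{"/search"}},
    }
    for _, r := range expected {
        fmt.Printf("%+v\n", r)
    }
}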

View File

@@ -0,0 +1,47 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SpamBot")
+      - userAgent.contains("EvilBot")
+  name: robots-txt-policy-blacklist-3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("GoodBot")
+      - path.startsWith("/private")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression:
+    any:
+      - userAgent.contains("SlowBot1")
+      - userAgent.contains("SlowBot2")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot1")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot2")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot3")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-9

View File

@@ -1,12 +1,12 @@
 - action: CHALLENGE
-  expression: path.matches("^/.*/private")
+  expression: path.matches("^/search.*")
   name: robots-txt-policy-disallow-1
 - action: CHALLENGE
-  expression: path.matches("^/admin/.*.action=delete")
+  expression: path.matches("^/.*/private")
   name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.matches("^/file.\\.txt")
   name: robots-txt-policy-disallow-3
 - action: CHALLENGE
-  expression: path.matches("^/search.*")
+  expression: path.matches("^/admin/.*.action=delete")
   name: robots-txt-policy-disallow-4