Merge 2e7f37215c912bf0448fd8a45973683a37a4f96b into bca2e87e80bc7ef976fea9dc16e18ea7a69b9933

Jason Cameron 2025-07-27 00:50:02 +00:00 committed by GitHub
commit 106f178e35
8 changed files with 211 additions and 58 deletions

View File

@@ -29,7 +29,7 @@ var (
)
type RobotsRule struct {
UserAgent string
UserAgents []string
Disallows []string
Allows []string
CrawlDelay int
@@ -133,7 +133,10 @@ func main() {
func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
scanner := bufio.NewScanner(input)
var rules []RobotsRule
var currentRule *RobotsRule
var currentUserAgents []string
var currentDisallows []string
var currentAllows []string
var currentCrawlDelay int
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
@@ -154,38 +157,58 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
switch directive {
case "user-agent":
// Start a new rule section
if currentRule != nil {
rules = append(rules, *currentRule)
}
currentRule = &RobotsRule{
UserAgent: value,
Disallows: make([]string, 0),
Allows: make([]string, 0),
// If we have accumulated rules with directives and encounter a new user-agent,
// flush the current rules
if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
rule := RobotsRule{
UserAgents: make([]string, len(currentUserAgents)),
Disallows: make([]string, len(currentDisallows)),
Allows: make([]string, len(currentAllows)),
CrawlDelay: currentCrawlDelay,
}
copy(rule.UserAgents, currentUserAgents)
copy(rule.Disallows, currentDisallows)
copy(rule.Allows, currentAllows)
rules = append(rules, rule)
// Reset for next group
currentUserAgents = nil
currentDisallows = nil
currentAllows = nil
currentCrawlDelay = 0
}
currentUserAgents = append(currentUserAgents, value)
case "disallow":
if currentRule != nil && value != "" {
currentRule.Disallows = append(currentRule.Disallows, value)
if len(currentUserAgents) > 0 && value != "" {
currentDisallows = append(currentDisallows, value)
}
case "allow":
if currentRule != nil && value != "" {
currentRule.Allows = append(currentRule.Allows, value)
if len(currentUserAgents) > 0 && value != "" {
currentAllows = append(currentAllows, value)
}
case "crawl-delay":
if currentRule != nil {
if len(currentUserAgents) > 0 {
if delay, err := parseIntSafe(value); err == nil {
currentRule.CrawlDelay = delay
currentCrawlDelay = delay
}
}
}
}
// Don't forget the last rule
if currentRule != nil {
rules = append(rules, *currentRule)
// Don't forget the last group of rules
if len(currentUserAgents) > 0 {
rule := RobotsRule{
UserAgents: make([]string, len(currentUserAgents)),
Disallows: make([]string, len(currentDisallows)),
Allows: make([]string, len(currentAllows)),
CrawlDelay: currentCrawlDelay,
}
copy(rule.UserAgents, currentUserAgents)
copy(rule.Disallows, currentDisallows)
copy(rule.Allows, currentAllows)
rules = append(rules, rule)
}
// Mark blacklisted user agents (those with "Disallow: /")
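For context, here is a minimal, self-contained sketch of the grouping approach the hunk above implements: consecutive `User-agent` lines accumulate into one group, and the group is flushed once a later `User-agent` line follows accumulated directives. All names below are illustrative and the logic is simplified (it only tracks `Disallow` and ignores `Allow`/`Crawl-delay`); it is not the tool's actual code.

```go
// Illustrative sketch of grouping consecutive User-agent lines; not robots2policy itself.
package main

import (
	"bufio"
	"fmt"
	"strings"
)

type group struct {
	Agents    []string
	Disallows []string
}

func parseGroups(input string) []group {
	var groups []group
	var cur group
	sawDirective := false

	sc := bufio.NewScanner(strings.NewReader(input))
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		key, value, ok := strings.Cut(line, ":")
		if !ok {
			continue
		}
		key, value = strings.ToLower(strings.TrimSpace(key)), strings.TrimSpace(value)

		switch key {
		case "user-agent":
			// A User-agent that follows directives starts a new group;
			// consecutive User-agent lines keep extending the current one.
			if sawDirective && len(cur.Agents) > 0 {
				groups = append(groups, cur)
				cur = group{}
				sawDirective = false
			}
			cur.Agents = append(cur.Agents, value)
		case "disallow":
			if len(cur.Agents) > 0 && value != "" {
				cur.Disallows = append(cur.Disallows, value)
				sawDirective = true
			}
		}
	}
	// Flush the final group.
	if len(cur.Agents) > 0 {
		groups = append(groups, cur)
	}
	return groups
}

func main() {
	robots := "User-agent: BadBot\nUser-agent: SpamBot\nDisallow: /\n\nUser-agent: GoodBot\nDisallow: /private\n"
	for _, g := range parseGroups(robots) {
		fmt.Println(g.Agents, g.Disallows) // [BadBot SpamBot] [/], then [GoodBot] [/private]
	}
}
```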
@@ -211,10 +234,11 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
var anubisRules []AnubisRule
ruleCounter := 0
// Process each robots rule individually
for _, robotsRule := range robotsRules {
userAgent := robotsRule.UserAgent
userAgents := robotsRule.UserAgents
// Handle crawl delay as weight adjustment (do this first before any continues)
// Handle crawl delay
if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
ruleCounter++
rule := AnubisRule{
@@ -223,20 +247,32 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
Weight: &config.Weight{Adjust: *crawlDelay},
}
if userAgent == "*" {
if len(userAgents) == 1 && userAgents[0] == "*" {
rule.Expression = &config.ExpressionOrList{
All: []string{"true"}, // Always applies
}
} else {
} else if len(userAgents) == 1 {
rule.Expression = &config.ExpressionOrList{
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
}
} else {
// Multiple user agents - use any block
var expressions []string
for _, ua := range userAgents {
if ua == "*" {
expressions = append(expressions, "true")
} else {
expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
}
}
rule.Expression = &config.ExpressionOrList{
Any: expressions,
}
}
anubisRules = append(anubisRules, rule)
}
// Handle blacklisted user agents (complete deny/challenge)
// Handle blacklisted user agents
if robotsRule.IsBlacklist {
ruleCounter++
rule := AnubisRule{
@@ -244,21 +280,36 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
Action: *userAgentDeny,
}
if userAgent == "*" {
// This would block everything - convert to a weight adjustment instead
rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
rule.Action = "WEIGH"
rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
rule.Expression = &config.ExpressionOrList{
All: []string{"true"}, // Always applies
if len(userAgents) == 1 {
userAgent := userAgents[0]
if userAgent == "*" {
// This would block everything - convert to a weight adjustment instead
rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
rule.Action = "WEIGH"
rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
rule.Expression = &config.ExpressionOrList{
All: []string{"true"}, // Always applies
}
} else {
rule.Expression = &config.ExpressionOrList{
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
}
}
} else {
// Multiple user agents - use any block
var expressions []string
for _, ua := range userAgents {
if ua == "*" {
expressions = append(expressions, "true")
} else {
expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
}
}
rule.Expression = &config.ExpressionOrList{
All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
Any: expressions,
}
}
anubisRules = append(anubisRules, rule)
continue
}
// Handle specific disallow rules
@@ -276,9 +327,33 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
// Build CEL expression
var conditions []string
// Add user agent condition if not wildcard
if userAgent != "*" {
conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
// Add user agent conditions
if len(userAgents) == 1 && userAgents[0] == "*" {
// Wildcard user agent - no user agent condition needed
} else if len(userAgents) == 1 {
conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
} else {
// For multiple user agents, we need to use a more complex expression
// This is a limitation - we can't easily combine any for user agents with all for path
// So we'll create separate rules for each user agent
for _, ua := range userAgents {
if ua == "*" {
continue // Skip wildcard as it's handled separately
}
ruleCounter++
subRule := AnubisRule{
Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
Action: *baseAction,
Expression: &config.ExpressionOrList{
All: []string{
fmt.Sprintf("userAgent.contains(%q)", ua),
buildPathCondition(disallow),
},
},
}
anubisRules = append(anubisRules, subRule)
}
continue
}
// Add path condition
@@ -291,7 +366,6 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
anubisRules = append(anubisRules, rule)
}
}
return anubisRules
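The expression selection in the hunks above can be summarized with a small sketch. The `exprOrList` type is a stand-in that mirrors only the `All`/`Any` fields of `config.ExpressionOrList` seen in the diff; the branching follows the same three cases: lone wildcard, single agent, multiple consecutive agents.

```go
// Sketch of the expression selection; exprOrList is a stand-in, not the real config type.
package main

import "fmt"

type exprOrList struct {
	All []string
	Any []string
}

// expressionFor mirrors the branching above: a lone "*" matches everything,
// a single agent becomes an all: condition, and several consecutive agents
// become an any: block.
func expressionFor(userAgents []string) exprOrList {
	if len(userAgents) == 1 && userAgents[0] == "*" {
		return exprOrList{All: []string{"true"}}
	}
	if len(userAgents) == 1 {
		return exprOrList{All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])}}
	}
	var exprs []string
	for _, ua := range userAgents {
		if ua == "*" {
			exprs = append(exprs, "true")
			continue
		}
		exprs = append(exprs, fmt.Sprintf("userAgent.contains(%q)", ua))
	}
	return exprOrList{Any: exprs}
}

func main() {
	fmt.Printf("%+v\n", expressionFor([]string{"*"}))
	fmt.Printf("%+v\n", expressionFor([]string{"GoodBot"}))
	fmt.Printf("%+v\n", expressionFor([]string{"BadBot", "SpamBot", "EvilBot"}))
}
```

The three-agent case is what produces the `any:` blocks in the expected YAML fixture added below.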

View File

@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
expectedFile: "complex.yaml",
options: TestOptions{format: "yaml", crawlDelayWeight: 5},
},
{
name: "consecutive_user_agents",
robotsFile: "consecutive.robots.txt",
expectedFile: "consecutive.yaml",
options: TestOptions{format: "yaml", crawlDelayWeight: 3},
},
}
for _, tc := range testCases {

View File

@@ -25,6 +25,6 @@
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search")
name: robots-txt-policy-disallow-7
- userAgent.contains("Googlebot")
- path.startsWith("/search")
name: robots-txt-policy-disallow-7

View File

@@ -20,8 +20,8 @@
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search/")
- userAgent.contains("Googlebot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-6
- action: WEIGH
expression: userAgent.contains("Bingbot")
@@ -31,14 +31,14 @@
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/search/")
- userAgent.contains("Bingbot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-8
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/admin/")
- userAgent.contains("Bingbot")
- path.startsWith("/admin/")
name: robots-txt-policy-disallow-9
- action: DENY
expression: userAgent.contains("BadBot")
@@ -54,18 +54,18 @@
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/.*/admin")
- userAgent.contains("TestBot")
- path.matches("^/.*/admin")
name: robots-txt-policy-disallow-13
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/temp.*\\.html")
- userAgent.contains("TestBot")
- path.matches("^/temp.*\\.html")
name: robots-txt-policy-disallow-14
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/file.\\.log")
- userAgent.contains("TestBot")
- path.matches("^/file.\\.log")
name: robots-txt-policy-disallow-15

View File

@@ -0,0 +1,25 @@
# Test consecutive user agents that should be grouped into any: blocks
User-agent: *
Disallow: /admin
Crawl-delay: 10
# Multiple consecutive user agents - should be grouped
User-agent: BadBot
User-agent: SpamBot
User-agent: EvilBot
Disallow: /
# Single user agent - should be separate
User-agent: GoodBot
Disallow: /private
# Multiple consecutive user agents with crawl delay
User-agent: SlowBot1
User-agent: SlowBot2
Crawl-delay: 5
# Multiple consecutive user agents with specific path
User-agent: SearchBot1
User-agent: SearchBot2
User-agent: SearchBot3
Disallow: /search

View File

@@ -0,0 +1,47 @@
- action: WEIGH
expression: "true"
name: robots-txt-policy-crawl-delay-1
weight:
adjust: 3
- action: CHALLENGE
expression: path.startsWith("/admin")
name: robots-txt-policy-disallow-2
- action: DENY
expression:
any:
- userAgent.contains("BadBot")
- userAgent.contains("SpamBot")
- userAgent.contains("EvilBot")
name: robots-txt-policy-blacklist-3
- action: CHALLENGE
expression:
all:
- userAgent.contains("GoodBot")
- path.startsWith("/private")
name: robots-txt-policy-disallow-4
- action: WEIGH
expression:
any:
- userAgent.contains("SlowBot1")
- userAgent.contains("SlowBot2")
name: robots-txt-policy-crawl-delay-5
weight:
adjust: 3
- action: CHALLENGE
expression:
all:
- userAgent.contains("SearchBot1")
- path.startsWith("/search")
name: robots-txt-policy-disallow-7
- action: CHALLENGE
expression:
all:
- userAgent.contains("SearchBot2")
- path.startsWith("/search")
name: robots-txt-policy-disallow-8
- action: CHALLENGE
expression:
all:
- userAgent.contains("SearchBot3")
- path.startsWith("/search")
name: robots-txt-policy-disallow-9
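One design note on the last three rules: because the expression list holds flat `all:`/`any:` string lists, the converter emits one `all:` rule per user agent when a group also has a path condition. A hypothetical single combined CEL expression, shown in the sketch below, is not what the tool produces; it is only an illustration of the nesting that the flat lists cannot express.

```go
// Hypothetical illustration: the nested CEL expression that would replace the
// three per-agent rules above. Not part of robots2policy.
package main

import (
	"fmt"
	"strings"
)

func combinedExpression(userAgents []string, pathPrefix string) string {
	parts := make([]string, 0, len(userAgents))
	for _, ua := range userAgents {
		parts = append(parts, fmt.Sprintf("userAgent.contains(%q)", ua))
	}
	// any-of-agents AND the path condition, in one CEL expression
	return fmt.Sprintf("(%s) && path.startsWith(%q)", strings.Join(parts, " || "), pathPrefix)
}

func main() {
	fmt.Println(combinedExpression([]string{"SearchBot1", "SearchBot2", "SearchBot3"}, "/search"))
	// (userAgent.contains("SearchBot1") || userAgent.contains("SearchBot2") || userAgent.contains("SearchBot3")) && path.startsWith("/search")
}
```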

View File

@@ -1,12 +1,12 @@
[
{
"action": "CHALLENGE",
"expression": "path.startsWith(\"/admin/\")",
"name": "robots-txt-policy-disallow-1"
"name": "robots-txt-policy-disallow-1",
"action": "CHALLENGE"
},
{
"action": "CHALLENGE",
"expression": "path.startsWith(\"/private\")",
"name": "robots-txt-policy-disallow-2"
"name": "robots-txt-policy-disallow-2",
"action": "CHALLENGE"
}
]

View File

@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- The [Thoth client](https://anubis.techaro.lol/docs/admin/thoth) is now public in the repo instead of being an internal package.
- [Custom-AsyncHttpClient](https://github.com/AsyncHttpClient/async-http-client)'s default User-Agent has an increased weight by default ([#852](https://github.com/TecharoHQ/anubis/issues/852)).
- The [`segments`](./admin/configuration/expressions.mdx#segments) function was added for splitting a path into its slash-separated segments.
- Fixed `robots2policy` to properly group consecutive user agents into `any:` blocks instead of only processing the last one.
## v1.21.3: Minfilia Warde - Echo 3