Mirror of https://github.com/TecharoHQ/anubis.git (synced 2025-08-03 17:59:24 -04:00)

commit e51b4bd965 (parent 291ed2a084)

    fix: collate
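In short: the converter previously emitted one RobotsRule per User-agent line and re-grouped related agents later in convertToAnubisRules through blacklistGroups/disallowGroups/crawlDelayGroups maps (sorted for deterministic output). With this commit, consecutive User-agent lines are collated during parsing into a single RobotsRule carrying a UserAgents slice; the grouping maps and the "sort" import go away, and convertToAnubisRules emits one Anubis rule per group, using an any: expression whenever a group names several agents. For example, per the new consecutive testdata added below, the group

    User-agent: BadBot
    User-agent: SpamBot
    User-agent: EvilBot
    Disallow: /

now converts to a single DENY rule whose expression is an any: list of userAgent.contains(...) checks.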
@@ -10,7 +10,6 @@ import (
     "net/http"
     "os"
     "regexp"
-    "sort"
     "strings"

     "github.com/TecharoHQ/anubis/lib/policy/config"
@@ -30,7 +29,7 @@ var (
 )

 type RobotsRule struct {
-    UserAgent  string
+    UserAgents []string
     Disallows  []string
     Allows     []string
     CrawlDelay int
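With UserAgents now a slice, one parsed group carries every agent it applies to. A minimal illustrative value (agent names borrowed from the consecutive testdata below; not a snippet from the repository):

    rule := RobotsRule{
        UserAgents: []string{"BadBot", "SpamBot", "EvilBot"},
        Disallows:  []string{"/"},
    }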
@@ -161,17 +160,16 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
         // If we have accumulated rules with directives and encounter a new user-agent,
         // flush the current rules
         if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
-            for _, userAgent := range currentUserAgents {
-                rule := RobotsRule{
-                    UserAgent:  userAgent,
-                    Disallows:  make([]string, len(currentDisallows)),
-                    Allows:     make([]string, len(currentAllows)),
-                    CrawlDelay: currentCrawlDelay,
-                }
-                copy(rule.Disallows, currentDisallows)
-                copy(rule.Allows, currentAllows)
-                rules = append(rules, rule)
+            rule := RobotsRule{
+                UserAgents: make([]string, len(currentUserAgents)),
+                Disallows:  make([]string, len(currentDisallows)),
+                Allows:     make([]string, len(currentAllows)),
+                CrawlDelay: currentCrawlDelay,
             }
+            copy(rule.UserAgents, currentUserAgents)
+            copy(rule.Disallows, currentDisallows)
+            copy(rule.Allows, currentAllows)
+            rules = append(rules, rule)

             // Reset for next group
             currentUserAgents = nil
             currentDisallows = nil
@@ -201,17 +199,16 @@ func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {

     // Don't forget the last group of rules
     if len(currentUserAgents) > 0 {
-        for _, userAgent := range currentUserAgents {
-            rule := RobotsRule{
-                UserAgent:  userAgent,
-                Disallows:  make([]string, len(currentDisallows)),
-                Allows:     make([]string, len(currentAllows)),
-                CrawlDelay: currentCrawlDelay,
-            }
-            copy(rule.Disallows, currentDisallows)
-            copy(rule.Allows, currentAllows)
-            rules = append(rules, rule)
+        rule := RobotsRule{
+            UserAgents: make([]string, len(currentUserAgents)),
+            Disallows:  make([]string, len(currentDisallows)),
+            Allows:     make([]string, len(currentAllows)),
+            CrawlDelay: currentCrawlDelay,
         }
+        copy(rule.UserAgents, currentUserAgents)
+        copy(rule.Disallows, currentDisallows)
+        copy(rule.Allows, currentAllows)
+        rules = append(rules, rule)
     }

     // Mark blacklisted user agents (those with "Disallow: /")
@@ -237,24 +234,82 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
     var anubisRules []AnubisRule
     ruleCounter := 0

-    // Group rules by their directives to create any blocks
-    blacklistGroups := make(map[string][]string)  // key: directive signature, value: user agents
-    disallowGroups := make(map[string][]string)   // key: path, value: user agents
-    crawlDelayGroups := make(map[string][]string) // key: delay, value: user agents
-
+    // Process each robots rule individually
     for _, robotsRule := range robotsRules {
-        userAgent := robotsRule.UserAgent
+        userAgents := robotsRule.UserAgents

-        // Handle crawl delay groups
+        // Handle crawl delay
         if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
-            key := fmt.Sprintf("delay-%d", robotsRule.CrawlDelay)
-            crawlDelayGroups[key] = append(crawlDelayGroups[key], userAgent)
+            ruleCounter++
+            rule := AnubisRule{
+                Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
+                Action: "WEIGH",
+                Weight: &config.Weight{Adjust: *crawlDelay},
+            }
+
+            if len(userAgents) == 1 && userAgents[0] == "*" {
+                rule.Expression = &config.ExpressionOrList{
+                    All: []string{"true"}, // Always applies
+                }
+            } else if len(userAgents) == 1 {
+                rule.Expression = &config.ExpressionOrList{
+                    All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
+                }
+            } else {
+                // Multiple user agents - use any block
+                var expressions []string
+                for _, ua := range userAgents {
+                    if ua == "*" {
+                        expressions = append(expressions, "true")
+                    } else {
+                        expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+                    }
+                }
+                rule.Expression = &config.ExpressionOrList{
+                    Any: expressions,
+                }
+            }
+            anubisRules = append(anubisRules, rule)
         }

         // Handle blacklisted user agents
         if robotsRule.IsBlacklist {
-            key := "blacklist"
-            blacklistGroups[key] = append(blacklistGroups[key], userAgent)
+            ruleCounter++
+            rule := AnubisRule{
+                Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
+                Action: *userAgentDeny,
+            }
+
+            if len(userAgents) == 1 {
+                userAgent := userAgents[0]
+                if userAgent == "*" {
+                    // This would block everything - convert to a weight adjustment instead
+                    rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
+                    rule.Action = "WEIGH"
+                    rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
+                    rule.Expression = &config.ExpressionOrList{
+                        All: []string{"true"}, // Always applies
+                    }
+                } else {
+                    rule.Expression = &config.ExpressionOrList{
+                        All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+                    }
+                }
+            } else {
+                // Multiple user agents - use any block
+                var expressions []string
+                for _, ua := range userAgents {
+                    if ua == "*" {
+                        expressions = append(expressions, "true")
+                    } else {
+                        expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+                    }
+                }
+                rule.Expression = &config.ExpressionOrList{
+                    Any: expressions,
+                }
+            }
+            anubisRules = append(anubisRules, rule)
         }

         // Handle specific disallow rules
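Note the wildcard handling carried over from the old grouping code: a blacklist on User-agent: * is not turned into a DENY, since that would block every visitor; it is rewritten into a WEIGH rule named <policy>-global-restriction-N with a weight adjustment of 20. Assuming single-element expression lists serialize to a plain scalar as in the testdata files, and with an illustrative counter value, the emitted YAML would look roughly like:

    - action: WEIGH
      expression: "true"
      name: robots-txt-policy-global-restriction-1
      weight:
        adjust: 20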
@@ -262,168 +317,64 @@ func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
             if disallow == "/" {
                 continue // Already handled as blacklist above
             }
-            disallowGroups[disallow] = append(disallowGroups[disallow], userAgent)
-        }
-    }
-
-    // Generate rules for crawl delays
-    // Sort keys for deterministic order
-    var crawlDelayKeys []string
-    for key := range crawlDelayGroups {
-        crawlDelayKeys = append(crawlDelayKeys, key)
-    }
-    sort.Strings(crawlDelayKeys)
-
-    for _, key := range crawlDelayKeys {
-        userAgents := crawlDelayGroups[key]
-        ruleCounter++
-        rule := AnubisRule{
-            Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
-            Action: "WEIGH",
-            Weight: &config.Weight{Adjust: *crawlDelay},
-        }
-
-        if len(userAgents) == 1 && userAgents[0] == "*" {
-            rule.Expression = &config.ExpressionOrList{
-                All: []string{"true"}, // Always applies
-            }
-        } else if len(userAgents) == 1 {
-            rule.Expression = &config.ExpressionOrList{
-                All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
-            }
-        } else {
-            // Multiple user agents - use any block
-            var expressions []string
-            for _, ua := range userAgents {
-                if ua == "*" {
-                    expressions = append(expressions, "true")
-                } else {
-                    expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
-                }
-            }
-            rule.Expression = &config.ExpressionOrList{
-                Any: expressions,
-            }
-        }
-
-        anubisRules = append(anubisRules, rule)
-    }
-
-    // Generate rules for blacklisted user agents
-    // Sort keys for deterministic order
-    var blacklistKeys []string
-    for key := range blacklistGroups {
-        blacklistKeys = append(blacklistKeys, key)
-    }
-    sort.Strings(blacklistKeys)
-
-    for _, key := range blacklistKeys {
-        userAgents := blacklistGroups[key]
-        ruleCounter++
-        rule := AnubisRule{
-            Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
-            Action: *userAgentDeny,
-        }
-
-        if len(userAgents) == 1 {
-            userAgent := userAgents[0]
-            if userAgent == "*" {
-                // This would block everything - convert to a weight adjustment instead
-                rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
-                rule.Action = "WEIGH"
-                rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
-                rule.Expression = &config.ExpressionOrList{
-                    All: []string{"true"}, // Always applies
-                }
-            } else {
-                rule.Expression = &config.ExpressionOrList{
-                    All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
-                }
-            }
-        } else {
-            // Multiple user agents - use any block
-            var expressions []string
-            for _, ua := range userAgents {
-                if ua == "*" {
-                    expressions = append(expressions, "true")
-                } else {
-                    expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
-                }
-            }
-            rule.Expression = &config.ExpressionOrList{
-                Any: expressions,
-            }
-        }
-
-        anubisRules = append(anubisRules, rule)
-    }
-
-    // Generate rules for specific disallow paths
-    // Sort keys for deterministic order
-    var disallowKeys []string
-    for key := range disallowGroups {
-        disallowKeys = append(disallowKeys, key)
-    }
-    sort.Strings(disallowKeys)
-
-    for _, path := range disallowKeys {
-        userAgents := disallowGroups[path]
-        ruleCounter++
-        rule := AnubisRule{
-            Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
-            Action: *baseAction,
-        }
-
-        // Build CEL expression
-        var conditions []string
-
-        // Add user agent conditions
-        if len(userAgents) == 1 && userAgents[0] == "*" {
-            // Wildcard user agent - no user agent condition needed
-        } else if len(userAgents) == 1 {
-            conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
-        } else {
-            // Multiple user agents - use any block for user agents
-            var uaExpressions []string
-            for _, ua := range userAgents {
-                if ua == "*" {
-                    uaExpressions = append(uaExpressions, "true")
-                } else {
-                    uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua))
-                }
-            }
-            // For multiple user agents, we need to use a more complex expression
-            // This is a limitation - we can't easily combine any for user agents with all for path
-            // So we'll create separate rules for each user agent
-            for _, ua := range userAgents {
-                if ua == "*" {
-                    continue // Skip wildcard as it's handled separately
-                }
-                ruleCounter++
-                subRule := AnubisRule{
-                    Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
-                    Action: *baseAction,
-                    Expression: &config.ExpressionOrList{
-                        All: []string{
-                            fmt.Sprintf("userAgent.contains(%q)", ua),
-                            buildPathCondition(path),
-                        },
-                    },
-                }
-                anubisRules = append(anubisRules, subRule)
-            }
-            continue
-        }
-
-        // Add path condition
-        pathCondition := buildPathCondition(path)
-        conditions = append(conditions, pathCondition)
-
-        rule.Expression = &config.ExpressionOrList{
-            All: conditions,
-        }
-
-        anubisRules = append(anubisRules, rule)
-    }
+            ruleCounter++
+            rule := AnubisRule{
+                Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+                Action: *baseAction,
+            }
+
+            // Build CEL expression
+            var conditions []string
+
+            // Add user agent conditions
+            if len(userAgents) == 1 && userAgents[0] == "*" {
+                // Wildcard user agent - no user agent condition needed
+            } else if len(userAgents) == 1 {
+                conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
+            } else {
+                // Multiple user agents - use any block for user agents
+                var uaExpressions []string
+                for _, ua := range userAgents {
+                    if ua == "*" {
+                        uaExpressions = append(uaExpressions, "true")
+                    } else {
+                        uaExpressions = append(uaExpressions, fmt.Sprintf("userAgent.contains(%q)", ua))
+                    }
+                }
+                // For multiple user agents, we need to use a more complex expression
+                // This is a limitation - we can't easily combine any for user agents with all for path
+                // So we'll create separate rules for each user agent
+                for _, ua := range userAgents {
+                    if ua == "*" {
+                        continue // Skip wildcard as it's handled separately
+                    }
+                    ruleCounter++
+                    subRule := AnubisRule{
+                        Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+                        Action: *baseAction,
+                        Expression: &config.ExpressionOrList{
+                            All: []string{
+                                fmt.Sprintf("userAgent.contains(%q)", ua),
+                                buildPathCondition(disallow),
+                            },
+                        },
+                    }
+                    anubisRules = append(anubisRules, subRule)
+                }
+                continue
+            }
+
+            // Add path condition
+            pathCondition := buildPathCondition(disallow)
+            conditions = append(conditions, pathCondition)
+
+            rule.Expression = &config.ExpressionOrList{
+                All: conditions,
+            }
+
+            anubisRules = append(anubisRules, rule)
+        }
+    }

     return anubisRules
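One limitation noted in the comments above is worth spelling out: a group that only targets user agents (blacklist or crawl delay) can be expressed as a single rule with an any: block, but a Disallow path needs all: [user-agent condition, path condition], and the any-of-several-agents check is not nested inside that all: list here. For a group with several agents and a specific path, the converter therefore emits one rule per agent. The consecutive testdata below shows both shapes: BadBot/SpamBot/EvilBot with Disallow: / collapse into one DENY using any:, while SearchBot1/2/3 with Disallow: /search become three separate CHALLENGE rules, each all: [userAgent.contains("SearchBotN"), path.startsWith("/search")].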
@@ -78,6 +78,12 @@ func TestDataFileConversion(t *testing.T) {
             expectedFile: "complex.yaml",
             options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
         },
+        {
+            name:         "consecutive_user_agents",
+            robotsFile:   "consecutive.robots.txt",
+            expectedFile: "consecutive.yaml",
+            options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+        },
     }

     for _, tc := range testCases {
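Assuming the standard Go test workflow (the diff itself implies nothing beyond go test), the new fixture can be exercised with something like:

    go test ./cmd/robots2policy/ -run TestDataFileConversion

The package path is taken from the testdata paths below.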
cmd/robots2policy/testdata/blacklist.yaml (vendored, 30 lines changed)
@@ -3,28 +3,28 @@
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 3
-- action: WEIGH
-  expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-2
-  weight:
-    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-3
 - action: WEIGH
   expression: userAgent.contains("SpamBot")
-  name: robots-txt-policy-crawl-delay-3
+  name: robots-txt-policy-crawl-delay-4
   weight:
     adjust: 3
 - action: DENY
-  expression:
-    any:
-      - userAgent.contains("BadBot")
-      - userAgent.contains("SpamBot")
-  name: robots-txt-policy-blacklist-4
-- action: CHALLENGE
-  expression: path.startsWith("/admin")
-  name: robots-txt-policy-disallow-5
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-blacklist-5
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-6
+  weight:
+    adjust: 3
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Googlebot")
       - path.startsWith("/search")
-  name: robots-txt-policy-disallow-6
+  name: robots-txt-policy-disallow-7
cmd/robots2policy/testdata/complex.yaml (vendored, 93 lines changed)
@@ -1,68 +1,71 @@
 - action: WEIGH
-  expression: userAgent.contains("Bingbot")
+  expression: "true"
   name: robots-txt-policy-crawl-delay-1
   weight:
     adjust: 5
-- action: WEIGH
-  expression: userAgent.contains("Googlebot")
-  name: robots-txt-policy-crawl-delay-2
-  weight:
-    adjust: 5
-- action: WEIGH
-  expression: userAgent.contains("SeoBot")
-  name: robots-txt-policy-crawl-delay-3
-  weight:
-    adjust: 5
-- action: WEIGH
-  expression: "true"
-  name: robots-txt-policy-crawl-delay-4
-  weight:
-    adjust: 5
-- action: DENY
-  expression:
-    any:
-      - userAgent.contains("BadBot")
-      - userAgent.contains("SeoBot")
-  name: robots-txt-policy-blacklist-5
 - action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/.*/admin")
-  name: robots-txt-policy-disallow-6
-- action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("Bingbot")
-      - path.startsWith("/admin/")
-  name: robots-txt-policy-disallow-8
-- action: CHALLENGE
-  expression: path.startsWith("/api/internal/")
-  name: robots-txt-policy-disallow-9
-- action: CHALLENGE
-  expression:
-    all:
-      - userAgent.contains("TestBot")
-      - path.matches("^/file.\\.log")
-  name: robots-txt-policy-disallow-10
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.startsWith("/private/")
-  name: robots-txt-policy-disallow-11
+  name: robots-txt-policy-disallow-3
+- action: CHALLENGE
+  expression: path.startsWith("/api/internal/")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 5
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Googlebot")
       - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-13
+  name: robots-txt-policy-disallow-6
+- action: WEIGH
+  expression: userAgent.contains("Bingbot")
+  name: robots-txt-policy-crawl-delay-7
+  weight:
+    adjust: 5
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("Bingbot")
       - path.startsWith("/search/")
-  name: robots-txt-policy-disallow-14
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("Bingbot")
+      - path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-9
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-10
+- action: WEIGH
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-crawl-delay-11
+  weight:
+    adjust: 5
+- action: DENY
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-blacklist-12
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("TestBot")
+      - path.matches("^/.*/admin")
+  name: robots-txt-policy-disallow-13
 - action: CHALLENGE
   expression:
     all:
       - userAgent.contains("TestBot")
       - path.matches("^/temp.*\\.html")
+  name: robots-txt-policy-disallow-14
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("TestBot")
+      - path.matches("^/file.\\.log")
   name: robots-txt-policy-disallow-15
cmd/robots2policy/testdata/consecutive.robots.txt (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
+# Test consecutive user agents that should be grouped into any: blocks
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+# Multiple consecutive user agents - should be grouped
+User-agent: BadBot
+User-agent: SpamBot
+User-agent: EvilBot
+Disallow: /
+
+# Single user agent - should be separate
+User-agent: GoodBot
+Disallow: /private
+
+# Multiple consecutive user agents with crawl delay
+User-agent: SlowBot1
+User-agent: SlowBot2
+Crawl-delay: 5
+
+# Multiple consecutive user agents with specific path
+User-agent: SearchBot1
+User-agent: SearchBot2
+User-agent: SearchBot3
+Disallow: /search
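The expected conversion follows in consecutive.yaml. Note that the wildcard group's Crawl-delay: 10 surfaces there as a WEIGH rule with adjust: 3, because the emitted weight comes from the converter's crawl-delay weight setting (crawlDelayWeight: 3 in the new test case above), not from the delay value in robots.txt.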
cmd/robots2policy/testdata/consecutive.yaml (vendored, new file, 47 lines)
@@ -0,0 +1,47 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression:
+    any:
+      - userAgent.contains("BadBot")
+      - userAgent.contains("SpamBot")
+      - userAgent.contains("EvilBot")
+  name: robots-txt-policy-blacklist-3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("GoodBot")
+      - path.startsWith("/private")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression:
+    any:
+      - userAgent.contains("SlowBot1")
+      - userAgent.contains("SlowBot2")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot1")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot2")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+      - userAgent.contains("SearchBot3")
+      - path.startsWith("/search")
+  name: robots-txt-policy-disallow-9
cmd/robots2policy/testdata/wildcards.yaml (vendored, 6 lines changed)
@@ -1,12 +1,12 @@
 - action: CHALLENGE
-  expression: path.matches("^/.*/private")
+  expression: path.matches("^/search.*")
   name: robots-txt-policy-disallow-1
 - action: CHALLENGE
-  expression: path.matches("^/admin/.*.action=delete")
+  expression: path.matches("^/.*/private")
   name: robots-txt-policy-disallow-2
 - action: CHALLENGE
   expression: path.matches("^/file.\\.txt")
   name: robots-txt-policy-disallow-3
 - action: CHALLENGE
-  expression: path.matches("^/search.*")
+  expression: path.matches("^/admin/.*.action=delete")
   name: robots-txt-policy-disallow-4