From e0781e456065c85654f9370e1de3ebbacccc0302 Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Sat, 14 Jun 2025 23:41:00 -0400 Subject: [PATCH] feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657) * feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies * feat: add documentation for robots2policy CLI tool * feat: implement crawl delay handling as weight adjustment in Anubis rules * feat: add various robots.txt and YAML configurations for user agent handling and crawl delays * test: add comprehensive tests for robots2policy conversion and parsing * fix: update example URL in usage instructions for robots2policy CLI * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot on-behalf-of: @check-spelling * docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI * Update cmd/robots2policy/main.go Co-authored-by: Xe Iaso Signed-off-by: Jason Cameron * Update cmd/robots2policy/main.go Co-authored-by: Xe Iaso Signed-off-by: Jason Cameron * fix(robots2policy): use sigs.k8s.io/yaml Signed-off-by: Xe Iaso * feat(config): properly marshal bot policy rules Signed-off-by: Xe Iaso * chore(yeetfile): expose robots2policy in libexec Signed-off-by: Xe Iaso * fix(yeetfile): put robots2policy in $PATH Signed-off-by: Xe Iaso * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot on-behalf-of: @check-spelling * style: reorder imports * refactor: use preexisting structs in config * fix: correct flag check in main function * fix: reorder fields in AnubisRule struct for better alignment * style: improve alignment of struct fields in AnubisRule and OGTagCache * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot on-behalf-of: @check-spelling * fix: add validation for generated Anubis rules from robots.txt * feat: add batch processing for robots.txt files to generate Anubis CEL policies * fix: improve usage message and error handling for input file requirement * refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling * refactor: reorganize policy definitions in YAML files for consistency and clarity * fix: correct indentation in blacklist and complex YAML files for consistency * test: enhance output comparison in robots2policy tests for YAML and JSON formats * Revert "fix: improve usage message and error handling for input file requirement" This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8. 
* fix: improve usage message and error handling in robots2policy Signed-off-by: Jason Cameron --------- Signed-off-by: check-spelling-bot Signed-off-by: Jason Cameron Signed-off-by: Xe Iaso Signed-off-by: Jason Cameron Co-authored-by: Xe Iaso --- .github/actions/spelling/expect.txt | 4 + cmd/robots2policy/batch/batch_process.go | 78 ++++ cmd/robots2policy/main.go | 313 +++++++++++++ cmd/robots2policy/robots2policy_test.go | 418 ++++++++++++++++++ .../testdata/blacklist.robots.txt | 15 + cmd/robots2policy/testdata/blacklist.yaml | 30 ++ cmd/robots2policy/testdata/complex.robots.txt | 30 ++ cmd/robots2policy/testdata/complex.yaml | 71 +++ cmd/robots2policy/testdata/custom-name.yaml | 6 + cmd/robots2policy/testdata/deny-action.yaml | 6 + cmd/robots2policy/testdata/empty.robots.txt | 2 + cmd/robots2policy/testdata/empty.yaml | 1 + cmd/robots2policy/testdata/simple.json | 12 + cmd/robots2policy/testdata/simple.robots.txt | 5 + cmd/robots2policy/testdata/simple.yaml | 6 + .../testdata/wildcards.robots.txt | 6 + cmd/robots2policy/testdata/wildcards.yaml | 12 + docs/docs/CHANGELOG.md | 1 + docs/docs/admin/robots2policy.mdx | 84 ++++ go.mod | 4 +- internal/ogtags/mem_test.go | 3 +- internal/ogtags/ogtags.go | 12 +- internal/ogtags/ogtags_fuzz_test.go | 3 +- lib/policy/config/config.go | 24 +- lib/policy/config/expressionorlist.go | 43 +- lib/policy/config/expressionorlist_test.go | 137 +++++- lib/policy/config/weight.go | 2 +- yeetfile.js | 1 + 28 files changed, 1302 insertions(+), 27 deletions(-) create mode 100644 cmd/robots2policy/batch/batch_process.go create mode 100644 cmd/robots2policy/main.go create mode 100644 cmd/robots2policy/robots2policy_test.go create mode 100644 cmd/robots2policy/testdata/blacklist.robots.txt create mode 100644 cmd/robots2policy/testdata/blacklist.yaml create mode 100644 cmd/robots2policy/testdata/complex.robots.txt create mode 100644 cmd/robots2policy/testdata/complex.yaml create mode 100644 cmd/robots2policy/testdata/custom-name.yaml create mode 100644 cmd/robots2policy/testdata/deny-action.yaml create mode 100644 cmd/robots2policy/testdata/empty.robots.txt create mode 100644 cmd/robots2policy/testdata/empty.yaml create mode 100644 cmd/robots2policy/testdata/simple.json create mode 100644 cmd/robots2policy/testdata/simple.robots.txt create mode 100644 cmd/robots2policy/testdata/simple.yaml create mode 100644 cmd/robots2policy/testdata/wildcards.robots.txt create mode 100644 cmd/robots2policy/testdata/wildcards.yaml create mode 100644 docs/docs/admin/robots2policy.mdx diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 60d917f..92d584c 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -12,6 +12,7 @@ archlinux badregexes bdba berr +betteralign bingbot bitcoin blogging @@ -96,6 +97,7 @@ gomod goodbot googlebot govulncheck +goyaml GPG GPT gptbot @@ -162,6 +164,7 @@ mojeekbot mozilla nbf netsurf +NFlag nginx nobots NONINFRINGEMENT @@ -217,6 +220,7 @@ sebest secretplans selfsigned Semrush +Seo setsebool shellcheck Sidetrade diff --git a/cmd/robots2policy/batch/batch_process.go b/cmd/robots2policy/batch/batch_process.go new file mode 100644 index 0000000..b448bb3 --- /dev/null +++ b/cmd/robots2policy/batch/batch_process.go @@ -0,0 +1,78 @@ +/* +Batch process robots.txt files from archives like https://github.com/nrjones8/robots-dot-txt-archive-bot/tree/master/data/cleaned +into Anubis CEL policies. 
Usage: go run batch_process.go <cleaned-robots-dir> +*/ +package main + +import ( + "fmt" + "io/fs" + "log" + "os" + "os/exec" + "path/filepath" + "strings" +) + +func main() { + if len(os.Args) < 2 { + fmt.Println("Usage: go run batch_process.go <cleaned-robots-dir>") + fmt.Println("Example: go run batch_process.go ./cleaned") + os.Exit(1) + } + + cleanedDir := os.Args[1] + outputDir := "generated_policies" + + // Create output directory + if err := os.MkdirAll(outputDir, 0755); err != nil { + log.Fatalf("Failed to create output directory: %v", err) + } + + count := 0 + err := filepath.WalkDir(cleanedDir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + // Skip directories + if d.IsDir() { + return nil + } + + // Generate policy name from file path + relPath, _ := filepath.Rel(cleanedDir, path) + policyName := strings.ReplaceAll(relPath, "/", "-") + policyName = strings.TrimSuffix(policyName, "-robots.txt") + policyName = strings.ReplaceAll(policyName, ".", "-") + + outputFile := filepath.Join(outputDir, policyName+".yaml") + + cmd := exec.Command("go", "run", "main.go", + "-input", path, + "-output", outputFile, + "-name", policyName, + "-format", "yaml") + + if err := cmd.Run(); err != nil { + fmt.Printf("Warning: Failed to process %s: %v\n", path, err) + return nil // Continue processing other files + } + + count++ + if count%100 == 0 { + fmt.Printf("Processed %d files...\n", count) + } else if count%10 == 0 { + fmt.Print(".") + } + + return nil + }) + + if err != nil { + log.Fatalf("Error walking directory: %v", err) + } + + fmt.Printf("Successfully processed %d robots.txt files\n", count) + fmt.Printf("Generated policies saved to: %s/\n", outputDir) +} diff --git a/cmd/robots2policy/main.go b/cmd/robots2policy/main.go new file mode 100644 index 0000000..eaa4d7f --- /dev/null +++ b/cmd/robots2policy/main.go @@ -0,0 +1,313 @@ +package main + +import ( + "bufio" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "net/http" + "os" + "regexp" + "strings" + + "github.com/TecharoHQ/anubis/lib/policy/config" + + "sigs.k8s.io/yaml" +) + +var ( + inputFile = flag.String("input", "", "path to robots.txt file (use - for stdin)") + outputFile = flag.String("output", "", "output file path (use - for stdout, defaults to stdout)") + outputFormat = flag.String("format", "yaml", "output format: yaml or json") + baseAction = flag.String("action", "CHALLENGE", "default action for disallowed paths: ALLOW, DENY, CHALLENGE, WEIGH") + crawlDelay = flag.Int("crawl-delay-weight", 0, "if > 0, add weight adjustment for crawl-delay (difficulty adjustment)") + policyName = flag.String("name", "robots-txt-policy", "name for the generated policy") + userAgentDeny = flag.String("deny-user-agents", "DENY", "action for specifically blocked user agents: DENY, CHALLENGE") + helpFlag = flag.Bool("help", false, "show help") +) + +type RobotsRule struct { + UserAgent string + Disallows []string + Allows []string + CrawlDelay int + IsBlacklist bool // true if this is a specifically denied user agent +} + +type AnubisRule struct { + Expression *config.ExpressionOrList `yaml:"expression,omitempty" json:"expression,omitempty"` + Challenge *config.ChallengeRules `yaml:"challenge,omitempty" json:"challenge,omitempty"` + Weight *config.Weight `yaml:"weight,omitempty" json:"weight,omitempty"` + Name string `yaml:"name" json:"name"` + Action string `yaml:"action" json:"action"` +} + +func init() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) + fmt.Fprintf(os.Stderr, "%s [options] -input <robots.txt file or URL>\n\n", 
os.Args[0]) + flag.PrintDefaults() + fmt.Fprintln(os.Stderr, "\nExamples:") + fmt.Fprintln(os.Stderr, " # Convert local robots.txt file") + fmt.Fprintln(os.Stderr, " robots2policy -input robots.txt -output policy.yaml") + fmt.Fprintln(os.Stderr, "") + fmt.Fprintln(os.Stderr, " # Convert from URL") + fmt.Fprintln(os.Stderr, " robots2policy -input https://example.com/robots.txt -format json") + fmt.Fprintln(os.Stderr, "") + fmt.Fprintln(os.Stderr, " # Read from stdin, write to stdout") + fmt.Fprintln(os.Stderr, " curl https://example.com/robots.txt | robots2policy -input -") + os.Exit(2) + } +} + +func main() { + flag.Parse() + + if len(flag.Args()) > 0 || *helpFlag || *inputFile == "" { + flag.Usage() + } + + // Read robots.txt + var input io.Reader + if *inputFile == "-" { + input = os.Stdin + } else if strings.HasPrefix(*inputFile, "http://") || strings.HasPrefix(*inputFile, "https://") { + resp, err := http.Get(*inputFile) + if err != nil { + log.Fatalf("failed to fetch robots.txt from URL: %v", err) + } + defer resp.Body.Close() + input = resp.Body + } else { + file, err := os.Open(*inputFile) + if err != nil { + log.Fatalf("failed to open input file: %v", err) + } + defer file.Close() + input = file + } + + // Parse robots.txt + rules, err := parseRobotsTxt(input) + if err != nil { + log.Fatalf("failed to parse robots.txt: %v", err) + } + + // Convert to Anubis rules + anubisRules := convertToAnubisRules(rules) + + // Check if any rules were generated + if len(anubisRules) == 0 { + log.Fatal("no valid rules generated from robots.txt - file may be empty or contain no disallow directives") + } + + // Generate output + var output []byte + switch strings.ToLower(*outputFormat) { + case "yaml": + output, err = yaml.Marshal(anubisRules) + case "json": + output, err = json.MarshalIndent(anubisRules, "", " ") + default: + log.Fatalf("unsupported output format: %s (use yaml or json)", *outputFormat) + } + + if err != nil { + log.Fatalf("failed to marshal output: %v", err) + } + + // Write output + if *outputFile == "" || *outputFile == "-" { + fmt.Print(string(output)) + } else { + err = os.WriteFile(*outputFile, output, 0644) + if err != nil { + log.Fatalf("failed to write output file: %v", err) + } + fmt.Printf("Generated Anubis policy written to %s\n", *outputFile) + } +} + +func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) { + scanner := bufio.NewScanner(input) + var rules []RobotsRule + var currentRule *RobotsRule + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + // Skip empty lines and comments + if line == "" || strings.HasPrefix(line, "#") { + continue + } + + // Split on first colon + parts := strings.SplitN(line, ":", 2) + if len(parts) != 2 { + continue + } + + directive := strings.TrimSpace(strings.ToLower(parts[0])) + value := strings.TrimSpace(parts[1]) + + switch directive { + case "user-agent": + // Start a new rule section + if currentRule != nil { + rules = append(rules, *currentRule) + } + currentRule = &RobotsRule{ + UserAgent: value, + Disallows: make([]string, 0), + Allows: make([]string, 0), + } + + case "disallow": + if currentRule != nil && value != "" { + currentRule.Disallows = append(currentRule.Disallows, value) + } + + case "allow": + if currentRule != nil && value != "" { + currentRule.Allows = append(currentRule.Allows, value) + } + + case "crawl-delay": + if currentRule != nil { + if delay, err := parseIntSafe(value); err == nil { + currentRule.CrawlDelay = delay + } + } + } + } + + // Don't forget the last rule + if currentRule 
!= nil { + rules = append(rules, *currentRule) + } + + // Mark blacklisted user agents (those with "Disallow: /") + for i := range rules { + for _, disallow := range rules[i].Disallows { + if disallow == "/" { + rules[i].IsBlacklist = true + break + } + } + } + + return rules, scanner.Err() +} + +func parseIntSafe(s string) (int, error) { + var result int + _, err := fmt.Sscanf(s, "%d", &result) + return result, err +} + +func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule { + var anubisRules []AnubisRule + ruleCounter := 0 + + for _, robotsRule := range robotsRules { + userAgent := robotsRule.UserAgent + + // Handle crawl delay as weight adjustment (do this first before any continues) + if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 { + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter), + Action: "WEIGH", + Weight: &config.Weight{Adjust: *crawlDelay}, + } + + if userAgent == "*" { + rule.Expression = &config.ExpressionOrList{ + All: []string{"true"}, // Always applies + } + } else { + rule.Expression = &config.ExpressionOrList{ + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + } + } + + anubisRules = append(anubisRules, rule) + } + + // Handle blacklisted user agents (complete deny/challenge) + if robotsRule.IsBlacklist { + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter), + Action: *userAgentDeny, + } + + if userAgent == "*" { + // This would block everything - convert to a weight adjustment instead + rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter) + rule.Action = "WEIGH" + rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly + rule.Expression = &config.ExpressionOrList{ + All: []string{"true"}, // Always applies + } + } else { + rule.Expression = &config.ExpressionOrList{ + All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)}, + } + } + anubisRules = append(anubisRules, rule) + continue + } + + // Handle specific disallow rules + for _, disallow := range robotsRule.Disallows { + if disallow == "/" { + continue // Already handled as blacklist above + } + + ruleCounter++ + rule := AnubisRule{ + Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter), + Action: *baseAction, + } + + // Build CEL expression + var conditions []string + + // Add user agent condition if not wildcard + if userAgent != "*" { + conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent)) + } + + // Add path condition + pathCondition := buildPathCondition(disallow) + conditions = append(conditions, pathCondition) + + rule.Expression = &config.ExpressionOrList{ + All: conditions, + } + + anubisRules = append(anubisRules, rule) + } + + } + + return anubisRules +} + +func buildPathCondition(robotsPath string) string { + // Handle wildcards in robots.txt paths + if strings.Contains(robotsPath, "*") || strings.Contains(robotsPath, "?") { + // Convert robots.txt wildcards to regex + regex := regexp.QuoteMeta(robotsPath) + regex = strings.ReplaceAll(regex, `\*`, `.*`) // * becomes .* + regex = strings.ReplaceAll(regex, `\?`, `.`) // ? becomes . 
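+ // after anchoring: "/temp*.html" -> "^/temp.*\.html", "/file?.log" -> "^/file.\.log"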
+ regex = "^" + regex + return fmt.Sprintf("path.matches(%q)", regex) + } + + // Simple prefix match for most cases + return fmt.Sprintf("path.startsWith(%q)", robotsPath) +} diff --git a/cmd/robots2policy/robots2policy_test.go b/cmd/robots2policy/robots2policy_test.go new file mode 100644 index 0000000..aa73f6b --- /dev/null +++ b/cmd/robots2policy/robots2policy_test.go @@ -0,0 +1,418 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "reflect" + "strings" + "testing" + + "gopkg.in/yaml.v3" +) + +type TestCase struct { + name string + robotsFile string + expectedFile string + options TestOptions +} + +type TestOptions struct { + format string + action string + crawlDelayWeight int + policyName string + deniedAction string +} + +func TestDataFileConversion(t *testing.T) { + + testCases := []TestCase{ + { + name: "simple_default", + robotsFile: "simple.robots.txt", + expectedFile: "simple.yaml", + options: TestOptions{format: "yaml"}, + }, + { + name: "simple_json", + robotsFile: "simple.robots.txt", + expectedFile: "simple.json", + options: TestOptions{format: "json"}, + }, + { + name: "simple_deny_action", + robotsFile: "simple.robots.txt", + expectedFile: "deny-action.yaml", + options: TestOptions{format: "yaml", action: "DENY"}, + }, + { + name: "simple_custom_name", + robotsFile: "simple.robots.txt", + expectedFile: "custom-name.yaml", + options: TestOptions{format: "yaml", policyName: "my-custom-policy"}, + }, + { + name: "blacklist_with_crawl_delay", + robotsFile: "blacklist.robots.txt", + expectedFile: "blacklist.yaml", + options: TestOptions{format: "yaml", crawlDelayWeight: 3}, + }, + { + name: "wildcards", + robotsFile: "wildcards.robots.txt", + expectedFile: "wildcards.yaml", + options: TestOptions{format: "yaml"}, + }, + { + name: "empty_file", + robotsFile: "empty.robots.txt", + expectedFile: "empty.yaml", + options: TestOptions{format: "yaml"}, + }, + { + name: "complex_scenario", + robotsFile: "complex.robots.txt", + expectedFile: "complex.yaml", + options: TestOptions{format: "yaml", crawlDelayWeight: 5}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + robotsPath := filepath.Join("testdata", tc.robotsFile) + expectedPath := filepath.Join("testdata", tc.expectedFile) + + // Read robots.txt input + robotsFile, err := os.Open(robotsPath) + if err != nil { + t.Fatalf("Failed to open robots file %s: %v", robotsPath, err) + } + defer robotsFile.Close() + + // Parse robots.txt + rules, err := parseRobotsTxt(robotsFile) + if err != nil { + t.Fatalf("Failed to parse robots.txt: %v", err) + } + + // Set test options + oldFormat := *outputFormat + oldAction := *baseAction + oldCrawlDelay := *crawlDelay + oldPolicyName := *policyName + oldDeniedAction := *userAgentDeny + + if tc.options.format != "" { + *outputFormat = tc.options.format + } + if tc.options.action != "" { + *baseAction = tc.options.action + } + if tc.options.crawlDelayWeight > 0 { + *crawlDelay = tc.options.crawlDelayWeight + } + if tc.options.policyName != "" { + *policyName = tc.options.policyName + } + if tc.options.deniedAction != "" { + *userAgentDeny = tc.options.deniedAction + } + + // Restore options after test + defer func() { + *outputFormat = oldFormat + *baseAction = oldAction + *crawlDelay = oldCrawlDelay + *policyName = oldPolicyName + *userAgentDeny = oldDeniedAction + }() + + // Convert to Anubis rules + anubisRules := convertToAnubisRules(rules) + + // Generate output + var actualOutput []byte + switch strings.ToLower(*outputFormat) { + 
case "yaml": + actualOutput, err = yaml.Marshal(anubisRules) + case "json": + actualOutput, err = json.MarshalIndent(anubisRules, "", " ") + } + if err != nil { + t.Fatalf("Failed to marshal output: %v", err) + } + + // Read expected output + expectedOutput, err := os.ReadFile(expectedPath) + if err != nil { + t.Fatalf("Failed to read expected file %s: %v", expectedPath, err) + } + + if strings.ToLower(*outputFormat) == "yaml" { + var actualData []interface{} + var expectedData []interface{} + + err = yaml.Unmarshal(actualOutput, &actualData) + if err != nil { + t.Fatalf("Failed to unmarshal actual output: %v", err) + } + + err = yaml.Unmarshal(expectedOutput, &expectedData) + if err != nil { + t.Fatalf("Failed to unmarshal expected output: %v", err) + } + + // Compare data structures + if !compareData(actualData, expectedData) { + actualStr := strings.TrimSpace(string(actualOutput)) + expectedStr := strings.TrimSpace(string(expectedOutput)) + t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr) + } + } else { + var actualData []interface{} + var expectedData []interface{} + + err = json.Unmarshal(actualOutput, &actualData) + if err != nil { + t.Fatalf("Failed to unmarshal actual JSON output: %v", err) + } + + err = json.Unmarshal(expectedOutput, &expectedData) + if err != nil { + t.Fatalf("Failed to unmarshal expected JSON output: %v", err) + } + + // Compare data structures + if !compareData(actualData, expectedData) { + actualStr := strings.TrimSpace(string(actualOutput)) + expectedStr := strings.TrimSpace(string(expectedOutput)) + t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr) + } + } + }) + } +} + +func TestCaseInsensitiveParsing(t *testing.T) { + robotsTxt := `User-Agent: * +Disallow: /admin +Crawl-Delay: 10 + +User-agent: TestBot +disallow: /test +crawl-delay: 5 + +USER-AGENT: UpperBot +DISALLOW: /upper +CRAWL-DELAY: 20` + + reader := strings.NewReader(robotsTxt) + rules, err := parseRobotsTxt(reader) + if err != nil { + t.Fatalf("Failed to parse case-insensitive robots.txt: %v", err) + } + + expectedRules := 3 + if len(rules) != expectedRules { + t.Errorf("Expected %d rules, got %d", expectedRules, len(rules)) + } + + // Check that all crawl delays were parsed + for i, rule := range rules { + expectedDelays := []int{10, 5, 20} + if rule.CrawlDelay != expectedDelays[i] { + t.Errorf("Rule %d: expected crawl delay %d, got %d", i, expectedDelays[i], rule.CrawlDelay) + } + } +} + +func TestVariousOutputFormats(t *testing.T) { + robotsTxt := `User-agent: * +Disallow: /admin` + + reader := strings.NewReader(robotsTxt) + rules, err := parseRobotsTxt(reader) + if err != nil { + t.Fatalf("Failed to parse robots.txt: %v", err) + } + + oldPolicyName := *policyName + *policyName = "test-policy" + defer func() { *policyName = oldPolicyName }() + + anubisRules := convertToAnubisRules(rules) + + // Test YAML output + yamlOutput, err := yaml.Marshal(anubisRules) + if err != nil { + t.Fatalf("Failed to marshal YAML: %v", err) + } + + if !strings.Contains(string(yamlOutput), "name: test-policy-disallow-1") { + t.Errorf("YAML output doesn't contain expected rule name") + } + + // Test JSON output + jsonOutput, err := json.MarshalIndent(anubisRules, "", " ") + if err != nil { + t.Fatalf("Failed to marshal JSON: %v", err) + } + + if !strings.Contains(string(jsonOutput), `"name": "test-policy-disallow-1"`) { + t.Errorf("JSON output doesn't contain expected rule name") + } +} + +func TestDifferentActions(t 
*testing.T) { + robotsTxt := `User-agent: * +Disallow: /admin` + + testActions := []string{"ALLOW", "DENY", "CHALLENGE", "WEIGH"} + + for _, action := range testActions { + t.Run("action_"+action, func(t *testing.T) { + reader := strings.NewReader(robotsTxt) + rules, err := parseRobotsTxt(reader) + if err != nil { + t.Fatalf("Failed to parse robots.txt: %v", err) + } + + oldAction := *baseAction + *baseAction = action + defer func() { *baseAction = oldAction }() + + anubisRules := convertToAnubisRules(rules) + + if len(anubisRules) != 1 { + t.Fatalf("Expected 1 rule, got %d", len(anubisRules)) + } + + if anubisRules[0].Action != action { + t.Errorf("Expected action %s, got %s", action, anubisRules[0].Action) + } + }) + } +} + +func TestPolicyNaming(t *testing.T) { + robotsTxt := `User-agent: * +Disallow: /admin +Disallow: /private + +User-agent: BadBot +Disallow: /` + + testNames := []string{"custom-policy", "my-rules", "site-protection"} + + for _, name := range testNames { + t.Run("name_"+name, func(t *testing.T) { + reader := strings.NewReader(robotsTxt) + rules, err := parseRobotsTxt(reader) + if err != nil { + t.Fatalf("Failed to parse robots.txt: %v", err) + } + + oldName := *policyName + *policyName = name + defer func() { *policyName = oldName }() + + anubisRules := convertToAnubisRules(rules) + + // Check that all rule names use the custom prefix + for _, rule := range anubisRules { + if !strings.HasPrefix(rule.Name, name+"-") { + t.Errorf("Rule name %s doesn't start with expected prefix %s-", rule.Name, name) + } + } + }) + } +} + +func TestCrawlDelayWeights(t *testing.T) { + robotsTxt := `User-agent: * +Disallow: /admin +Crawl-delay: 10 + +User-agent: SlowBot +Disallow: /slow +Crawl-delay: 60` + + testWeights := []int{1, 5, 10, 25} + + for _, weight := range testWeights { + t.Run(fmt.Sprintf("weight_%d", weight), func(t *testing.T) { + reader := strings.NewReader(robotsTxt) + rules, err := parseRobotsTxt(reader) + if err != nil { + t.Fatalf("Failed to parse robots.txt: %v", err) + } + + oldWeight := *crawlDelay + *crawlDelay = weight + defer func() { *crawlDelay = oldWeight }() + + anubisRules := convertToAnubisRules(rules) + + // Count weight rules and verify they have correct weight + weightRules := 0 + for _, rule := range anubisRules { + if rule.Action == "WEIGH" && rule.Weight != nil { + weightRules++ + if rule.Weight.Adjust != weight { + t.Errorf("Expected weight %d, got %d", weight, rule.Weight.Adjust) + } + } + } + + expectedWeightRules := 2 // One for *, one for SlowBot + if weightRules != expectedWeightRules { + t.Errorf("Expected %d weight rules, got %d", expectedWeightRules, weightRules) + } + }) + } +} + +func TestBlacklistActions(t *testing.T) { + robotsTxt := `User-agent: BadBot +Disallow: / + +User-agent: SpamBot +Disallow: /` + + testActions := []string{"DENY", "CHALLENGE"} + + for _, action := range testActions { + t.Run("blacklist_"+action, func(t *testing.T) { + reader := strings.NewReader(robotsTxt) + rules, err := parseRobotsTxt(reader) + if err != nil { + t.Fatalf("Failed to parse robots.txt: %v", err) + } + + oldAction := *userAgentDeny + *userAgentDeny = action + defer func() { *userAgentDeny = oldAction }() + + anubisRules := convertToAnubisRules(rules) + + // All rules should be blacklist rules with the specified action + for _, rule := range anubisRules { + if !strings.Contains(rule.Name, "blacklist") { + t.Errorf("Expected blacklist rule, got %s", rule.Name) + } + if rule.Action != action { + t.Errorf("Expected action %s, got %s", action, 
rule.Action) + } + } + }) + } +} + +// compareData performs a deep comparison of two data structures, +// ignoring differences that are semantically equivalent in YAML/JSON +func compareData(actual, expected interface{}) bool { + return reflect.DeepEqual(actual, expected) +} diff --git a/cmd/robots2policy/testdata/blacklist.robots.txt b/cmd/robots2policy/testdata/blacklist.robots.txt new file mode 100644 index 0000000..829bbda --- /dev/null +++ b/cmd/robots2policy/testdata/blacklist.robots.txt @@ -0,0 +1,15 @@ +# Test with blacklisted user agents +User-agent: * +Disallow: /admin +Crawl-delay: 10 + +User-agent: BadBot +Disallow: / + +User-agent: SpamBot +Disallow: / +Crawl-delay: 60 + +User-agent: Googlebot +Disallow: /search +Crawl-delay: 5 \ No newline at end of file diff --git a/cmd/robots2policy/testdata/blacklist.yaml b/cmd/robots2policy/testdata/blacklist.yaml new file mode 100644 index 0000000..b22f06f --- /dev/null +++ b/cmd/robots2policy/testdata/blacklist.yaml @@ -0,0 +1,30 @@ +- action: WEIGH + expression: "true" + name: robots-txt-policy-crawl-delay-1 + weight: + adjust: 3 +- action: CHALLENGE + expression: path.startsWith("/admin") + name: robots-txt-policy-disallow-2 +- action: DENY + expression: userAgent.contains("BadBot") + name: robots-txt-policy-blacklist-3 +- action: WEIGH + expression: userAgent.contains("SpamBot") + name: robots-txt-policy-crawl-delay-4 + weight: + adjust: 3 +- action: DENY + expression: userAgent.contains("SpamBot") + name: robots-txt-policy-blacklist-5 +- action: WEIGH + expression: userAgent.contains("Googlebot") + name: robots-txt-policy-crawl-delay-6 + weight: + adjust: 3 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Googlebot") + - path.startsWith("/search") + name: robots-txt-policy-disallow-7 \ No newline at end of file diff --git a/cmd/robots2policy/testdata/complex.robots.txt b/cmd/robots2policy/testdata/complex.robots.txt new file mode 100644 index 0000000..a44c536 --- /dev/null +++ b/cmd/robots2policy/testdata/complex.robots.txt @@ -0,0 +1,30 @@ +# Complex real-world example +User-agent: * +Disallow: /admin/ +Disallow: /private/ +Disallow: /api/internal/ +Allow: /api/public/ +Crawl-delay: 5 + +User-agent: Googlebot +Disallow: /search/ +Allow: /api/ +Crawl-delay: 2 + +User-agent: Bingbot +Disallow: /search/ +Disallow: /admin/ +Crawl-delay: 10 + +User-agent: BadBot +Disallow: / + +User-agent: SeoBot +Disallow: / +Crawl-delay: 300 + +# Test with various patterns +User-agent: TestBot +Disallow: /*/admin +Disallow: /temp*.html +Disallow: /file?.log \ No newline at end of file diff --git a/cmd/robots2policy/testdata/complex.yaml b/cmd/robots2policy/testdata/complex.yaml new file mode 100644 index 0000000..2eb0d19 --- /dev/null +++ b/cmd/robots2policy/testdata/complex.yaml @@ -0,0 +1,71 @@ +- action: WEIGH + expression: "true" + name: robots-txt-policy-crawl-delay-1 + weight: + adjust: 5 +- action: CHALLENGE + expression: path.startsWith("/admin/") + name: robots-txt-policy-disallow-2 +- action: CHALLENGE + expression: path.startsWith("/private/") + name: robots-txt-policy-disallow-3 +- action: CHALLENGE + expression: path.startsWith("/api/internal/") + name: robots-txt-policy-disallow-4 +- action: WEIGH + expression: userAgent.contains("Googlebot") + name: robots-txt-policy-crawl-delay-5 + weight: + adjust: 5 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Googlebot") + - path.startsWith("/search/") + name: robots-txt-policy-disallow-6 +- action: WEIGH + expression: userAgent.contains("Bingbot") + name: 
robots-txt-policy-crawl-delay-7 + weight: + adjust: 5 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Bingbot") + - path.startsWith("/search/") + name: robots-txt-policy-disallow-8 +- action: CHALLENGE + expression: + all: + - userAgent.contains("Bingbot") + - path.startsWith("/admin/") + name: robots-txt-policy-disallow-9 +- action: DENY + expression: userAgent.contains("BadBot") + name: robots-txt-policy-blacklist-10 +- action: WEIGH + expression: userAgent.contains("SeoBot") + name: robots-txt-policy-crawl-delay-11 + weight: + adjust: 5 +- action: DENY + expression: userAgent.contains("SeoBot") + name: robots-txt-policy-blacklist-12 +- action: CHALLENGE + expression: + all: + - userAgent.contains("TestBot") + - path.matches("^/.*/admin") + name: robots-txt-policy-disallow-13 +- action: CHALLENGE + expression: + all: + - userAgent.contains("TestBot") + - path.matches("^/temp.*\\.html") + name: robots-txt-policy-disallow-14 +- action: CHALLENGE + expression: + all: + - userAgent.contains("TestBot") + - path.matches("^/file.\\.log") + name: robots-txt-policy-disallow-15 diff --git a/cmd/robots2policy/testdata/custom-name.yaml b/cmd/robots2policy/testdata/custom-name.yaml new file mode 100644 index 0000000..a299285 --- /dev/null +++ b/cmd/robots2policy/testdata/custom-name.yaml @@ -0,0 +1,6 @@ +- action: CHALLENGE + expression: path.startsWith("/admin/") + name: my-custom-policy-disallow-1 +- action: CHALLENGE + expression: path.startsWith("/private") + name: my-custom-policy-disallow-2 diff --git a/cmd/robots2policy/testdata/deny-action.yaml b/cmd/robots2policy/testdata/deny-action.yaml new file mode 100644 index 0000000..0b16e18 --- /dev/null +++ b/cmd/robots2policy/testdata/deny-action.yaml @@ -0,0 +1,6 @@ +- action: DENY + expression: path.startsWith("/admin/") + name: robots-txt-policy-disallow-1 +- action: DENY + expression: path.startsWith("/private") + name: robots-txt-policy-disallow-2 diff --git a/cmd/robots2policy/testdata/empty.robots.txt b/cmd/robots2policy/testdata/empty.robots.txt new file mode 100644 index 0000000..090fee6 --- /dev/null +++ b/cmd/robots2policy/testdata/empty.robots.txt @@ -0,0 +1,2 @@ +# Empty robots.txt (comments only) +# No actual rules \ No newline at end of file diff --git a/cmd/robots2policy/testdata/empty.yaml b/cmd/robots2policy/testdata/empty.yaml new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/cmd/robots2policy/testdata/empty.yaml @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/cmd/robots2policy/testdata/simple.json b/cmd/robots2policy/testdata/simple.json new file mode 100644 index 0000000..20bdf0d --- /dev/null +++ b/cmd/robots2policy/testdata/simple.json @@ -0,0 +1,12 @@ +[ + { + "action": "CHALLENGE", + "expression": "path.startsWith(\"/admin/\")", + "name": "robots-txt-policy-disallow-1" + }, + { + "action": "CHALLENGE", + "expression": "path.startsWith(\"/private\")", + "name": "robots-txt-policy-disallow-2" + } +] \ No newline at end of file diff --git a/cmd/robots2policy/testdata/simple.robots.txt b/cmd/robots2policy/testdata/simple.robots.txt new file mode 100644 index 0000000..44a8bd4 --- /dev/null +++ b/cmd/robots2policy/testdata/simple.robots.txt @@ -0,0 +1,5 @@ +# Simple robots.txt test +User-agent: * +Disallow: /admin/ +Disallow: /private +Allow: /public \ No newline at end of file diff --git a/cmd/robots2policy/testdata/simple.yaml b/cmd/robots2policy/testdata/simple.yaml new file mode 100644 index 0000000..0054056 --- /dev/null +++ b/cmd/robots2policy/testdata/simple.yaml @@ -0,0 +1,6 @@ 
+- action: CHALLENGE + expression: path.startsWith("/admin/") + name: robots-txt-policy-disallow-1 +- action: CHALLENGE + expression: path.startsWith("/private") + name: robots-txt-policy-disallow-2 diff --git a/cmd/robots2policy/testdata/wildcards.robots.txt b/cmd/robots2policy/testdata/wildcards.robots.txt new file mode 100644 index 0000000..f6c7746 --- /dev/null +++ b/cmd/robots2policy/testdata/wildcards.robots.txt @@ -0,0 +1,6 @@ +# Test wildcard patterns +User-agent: * +Disallow: /search* +Disallow: /*/private +Disallow: /file?.txt +Disallow: /admin/*?action=delete \ No newline at end of file diff --git a/cmd/robots2policy/testdata/wildcards.yaml b/cmd/robots2policy/testdata/wildcards.yaml new file mode 100644 index 0000000..ff51578 --- /dev/null +++ b/cmd/robots2policy/testdata/wildcards.yaml @@ -0,0 +1,12 @@ +- action: CHALLENGE + expression: path.matches("^/search.*") + name: robots-txt-policy-disallow-1 +- action: CHALLENGE + expression: path.matches("^/.*/private") + name: robots-txt-policy-disallow-2 +- action: CHALLENGE + expression: path.matches("^/file.\\.txt") + name: robots-txt-policy-disallow-3 +- action: CHALLENGE + expression: path.matches("^/admin/.*.action=delete") + name: robots-txt-policy-disallow-4 diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 8147866..b029cb9 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Make progress bar styling more compatible (UXP, etc) - Optimized the OGTags subsystem with reduced allocations and runtime per request by up to 66% - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers +- Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409)) ## v1.19.1: Jenomis cen Lexentale - Echo 1 diff --git a/docs/docs/admin/robots2policy.mdx b/docs/docs/admin/robots2policy.mdx new file mode 100644 index 0000000..30f0eab --- /dev/null +++ b/docs/docs/admin/robots2policy.mdx @@ -0,0 +1,84 @@ +--- +title: robots2policy CLI Tool +sidebar_position: 50 +--- + +The `robots2policy` tool converts robots.txt files into Anubis challenge policies. It reads robots.txt rules and generates equivalent CEL expressions for path matching and user-agent filtering. 
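+
+For example, plain path prefixes become `startsWith` checks, while wildcard patterns are rewritten as anchored regular expressions (the same conversions exercised by the tool's testdata):
+
+```txt
+User-agent: *
+Disallow: /search*
+Disallow: /admin
+```
+
+becomes:
+
+```yaml
+- action: CHALLENGE
+  expression: path.matches("^/search.*")
+  name: robots-txt-policy-disallow-1
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+```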
+ +## Installation + +Install directly with Go: + +```bash +go install github.com/TecharoHQ/anubis/cmd/robots2policy@latest +``` + +## Usage + +Basic conversion from URL: + +```bash +robots2policy -input https://www.example.com/robots.txt +``` + +Convert local file to YAML: + +```bash +robots2policy -input robots.txt -output policy.yaml +``` + +Convert with custom settings: + +```bash +robots2policy -input robots.txt -action DENY -format json +``` + +## Options + +| Flag | Description | Default | |-----------------------|--------------------------------------------------------------------|---------------------| | `-input` | robots.txt file path or URL (use `-` for stdin) | *required* | | `-output` | Output file (use `-` for stdout) | stdout | | `-format` | Output format: `yaml` or `json` | `yaml` | | `-action` | Action for disallowed paths: `ALLOW`, `DENY`, `CHALLENGE`, `WEIGH` | `CHALLENGE` | | `-name` | Policy name prefix | `robots-txt-policy` | | `-crawl-delay-weight` | Weight adjustment for crawl-delay rules (`0` disables) | `0` | | `-deny-user-agents` | Action for blacklisted user agents | `DENY` | + +## Example + +Input robots.txt: +```txt +User-agent: * +Disallow: /admin/ +Disallow: /private + +User-agent: BadBot +Disallow: / +``` + +Generated policy: +```yaml +- action: CHALLENGE + expression: path.startsWith("/admin/") + name: robots-txt-policy-disallow-1 +- action: CHALLENGE + expression: path.startsWith("/private") + name: robots-txt-policy-disallow-2 +- action: DENY + expression: userAgent.contains("BadBot") + name: robots-txt-policy-blacklist-3 +``` + +## Using the Generated Policy + +Save the output and import it in your main policy file: + +```yaml +import: + - path: "./robots-policy.yaml" +``` + +The tool handles wildcard patterns, user-agent specific rules, and blacklisted bots automatically.
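+
+If you want to sanity-check a generated file before importing it, the rules unmarshal into the same `config.BotConfig` structures Anubis loads at startup. A minimal sketch (the standalone checker program and the file name are illustrative, not part of this patch):
+
+```go
+package main
+
+import (
+	"log"
+	"os"
+
+	"github.com/TecharoHQ/anubis/lib/policy/config"
+
+	"sigs.k8s.io/yaml"
+)
+
+func main() {
+	// Assumes robots2policy already wrote this file.
+	data, err := os.ReadFile("robots-policy.yaml")
+	if err != nil {
+		log.Fatalf("read policy: %v", err)
+	}
+
+	var rules []config.BotConfig
+	if err := yaml.Unmarshal(data, &rules); err != nil {
+		log.Fatalf("parse policy: %v", err)
+	}
+
+	// Valid reports the structural problems Anubis would reject at startup.
+	for i := range rules {
+		if err := rules[i].Valid(); err != nil {
+			log.Fatalf("rule %q: %v", rules[i].Name, err)
+		}
+	}
+	log.Printf("all %d rules look valid", len(rules))
+}
+```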
diff --git a/go.mod b/go.mod index 0cf1be6..4ec185f 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,9 @@ require ( github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a github.com/yl2chen/cidranger v1.0.2 golang.org/x/net v0.41.0 + gopkg.in/yaml.v3 v3.0.1 k8s.io/apimachinery v0.33.1 + sigs.k8s.io/yaml v1.4.0 ) require ( @@ -104,11 +106,9 @@ require ( google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect google.golang.org/protobuf v1.36.5 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect honnef.co/go/tools v0.6.1 // indirect mvdan.cc/sh/v3 v3.11.0 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect ) tool ( diff --git a/internal/ogtags/mem_test.go b/internal/ogtags/mem_test.go index 9db4d68..c9482cc 100644 --- a/internal/ogtags/mem_test.go +++ b/internal/ogtags/mem_test.go @@ -1,11 +1,12 @@ package ogtags import ( - "golang.org/x/net/html" "net/url" "runtime" "strings" "testing" + + "golang.org/x/net/html" ) func BenchmarkGetTarget(b *testing.B) { diff --git a/internal/ogtags/ogtags.go b/internal/ogtags/ogtags.go index 3779c06..7b7c9d9 100644 --- a/internal/ogtags/ogtags.go +++ b/internal/ogtags/ogtags.go @@ -21,17 +21,17 @@ const ( ) type OGTagCache struct { - cache *decaymap.Impl[string, map[string]string] - targetURL *url.URL - client *http.Client + cache *decaymap.Impl[string, map[string]string] + targetURL *url.URL + client *http.Client + + // Pre-built strings for optimization + unixPrefix string // "http://unix" approvedTags []string approvedPrefixes []string ogTimeToLive time.Duration ogCacheConsiderHost bool ogPassthrough bool - - // Pre-built strings for optimization - unixPrefix string // "http://unix" } func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration, ogTagsConsiderHost bool) *OGTagCache { diff --git a/internal/ogtags/ogtags_fuzz_test.go b/internal/ogtags/ogtags_fuzz_test.go index c6e40fa..752aa2f 100644 --- a/internal/ogtags/ogtags_fuzz_test.go +++ b/internal/ogtags/ogtags_fuzz_test.go @@ -1,11 +1,12 @@ package ogtags import ( - "golang.org/x/net/html" "net/url" "strings" "testing" "unicode/utf8" + + "golang.org/x/net/html" ) // FuzzGetTarget tests getTarget with various inputs diff --git a/lib/policy/config/config.go b/lib/policy/config/config.go index d140549..5e2a96b 100644 --- a/lib/policy/config/config.go +++ b/lib/policy/config/config.go @@ -46,15 +46,15 @@ const ( const DefaultAlgorithm = "fast" type BotConfig struct { - UserAgentRegex *string `json:"user_agent_regex,omitempty"` - PathRegex *string `json:"path_regex,omitempty"` - HeadersRegex map[string]string `json:"headers_regex,omitempty"` - Expression *ExpressionOrList `json:"expression,omitempty"` - Challenge *ChallengeRules `json:"challenge,omitempty"` - Weight *Weight `json:"weight,omitempty"` - Name string `json:"name"` - Action Rule `json:"action"` - RemoteAddr []string `json:"remote_addresses,omitempty"` + UserAgentRegex *string `json:"user_agent_regex,omitempty" yaml:"user_agent_regex,omitempty"` + PathRegex *string `json:"path_regex,omitempty" yaml:"path_regex,omitempty"` + HeadersRegex map[string]string `json:"headers_regex,omitempty" yaml:"headers_regex,omitempty"` + Expression *ExpressionOrList `json:"expression,omitempty" yaml:"expression,omitempty"` + Challenge *ChallengeRules `json:"challenge,omitempty" yaml:"challenge,omitempty"` + Weight *Weight `json:"weight,omitempty" yaml:"weight,omitempty"` + Name string `json:"name" yaml:"name"` 
+ Action Rule `json:"action" yaml:"action"` + RemoteAddr []string `json:"remote_addresses,omitempty" yaml:"remote_addresses,omitempty"` } func (b BotConfig) Zero() bool { @@ -170,9 +170,9 @@ func (b *BotConfig) Valid() error { } type ChallengeRules struct { - Algorithm string `json:"algorithm"` - Difficulty int `json:"difficulty"` - ReportAs int `json:"report_as"` + Algorithm string `json:"algorithm,omitempty" yaml:"algorithm,omitempty"` + Difficulty int `json:"difficulty,omitempty" yaml:"difficulty,omitempty"` + ReportAs int `json:"report_as,omitempty" yaml:"report_as,omitempty"` } var ( diff --git a/lib/policy/config/expressionorlist.go b/lib/policy/config/expressionorlist.go index 8851c5b..68dafd2 100644 --- a/lib/policy/config/expressionorlist.go +++ b/lib/policy/config/expressionorlist.go @@ -13,9 +13,9 @@ var ( ) type ExpressionOrList struct { - Expression string `json:"-"` - All []string `json:"all,omitempty"` - Any []string `json:"any,omitempty"` + Expression string `json:"-" yaml:"-"` + All []string `json:"all,omitempty" yaml:"all,omitempty"` + Any []string `json:"any,omitempty" yaml:"any,omitempty"` } func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool { @@ -34,6 +34,43 @@ func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool { return true } +func (eol *ExpressionOrList) MarshalYAML() (any, error) { + switch { + case len(eol.All) == 1 && len(eol.Any) == 0: + eol.Expression = eol.All[0] + eol.All = nil + case len(eol.Any) == 1 && len(eol.All) == 0: + eol.Expression = eol.Any[0] + eol.Any = nil + } + + if eol.Expression != "" { + return eol.Expression, nil + } + + type RawExpressionOrList ExpressionOrList + return RawExpressionOrList(*eol), nil +} + +func (eol *ExpressionOrList) MarshalJSON() ([]byte, error) { + switch { + case len(eol.All) == 1 && len(eol.Any) == 0: + eol.Expression = eol.All[0] + eol.All = nil + case len(eol.Any) == 1 && len(eol.All) == 0: + eol.Expression = eol.Any[0] + eol.Any = nil + } + + if eol.Expression != "" { + return json.Marshal(string(eol.Expression)) + } + + type RawExpressionOrList ExpressionOrList + val := RawExpressionOrList(*eol) + return json.Marshal(val) +} + func (eol *ExpressionOrList) UnmarshalJSON(data []byte) error { switch string(data[0]) { case `"`: // string diff --git a/lib/policy/config/expressionorlist_test.go b/lib/policy/config/expressionorlist_test.go index dbdda2d..8d0c843 100644 --- a/lib/policy/config/expressionorlist_test.go +++ b/lib/policy/config/expressionorlist_test.go @@ -1,12 +1,147 @@ package config import ( + "bytes" "encoding/json" "errors" "testing" + + yaml "sigs.k8s.io/yaml/goyaml.v3" ) -func TestExpressionOrListUnmarshal(t *testing.T) { +func TestExpressionOrListMarshalJSON(t *testing.T) { + for _, tt := range []struct { + name string + input *ExpressionOrList + output []byte + err error + }{ + { + name: "single expression", + input: &ExpressionOrList{ + Expression: "true", + }, + output: []byte(`"true"`), + err: nil, + }, + { + name: "all", + input: &ExpressionOrList{ + All: []string{"true", "true"}, + }, + output: []byte(`{"all":["true","true"]}`), + err: nil, + }, + { + name: "all one", + input: &ExpressionOrList{ + All: []string{"true"}, + }, + output: []byte(`"true"`), + err: nil, + }, + { + name: "any", + input: &ExpressionOrList{ + Any: []string{"true", "false"}, + }, + output: []byte(`{"any":["true","false"]}`), + err: nil, + }, + { + name: "any one", + input: &ExpressionOrList{ + Any: []string{"true"}, + }, + output: []byte(`"true"`), + err: nil, + }, + } { + t.Run(tt.name, func(t 
*testing.T) { + result, err := json.Marshal(tt.input) + if !errors.Is(err, tt.err) { + t.Errorf("wanted marshal error: %v but got: %v", tt.err, err) + } + + if !bytes.Equal(result, tt.output) { + t.Logf("wanted: %s", string(tt.output)) + t.Logf("got: %s", string(result)) + t.Error("mismatched output") + } + }) + } +} + +func TestExpressionOrListMarshalYAML(t *testing.T) { + for _, tt := range []struct { + name string + input *ExpressionOrList + output []byte + err error + }{ + { + name: "single expression", + input: &ExpressionOrList{ + Expression: "true", + }, + output: []byte(`"true"`), + err: nil, + }, + { + name: "all", + input: &ExpressionOrList{ + All: []string{"true", "true"}, + }, + output: []byte(`all: + - "true" + - "true"`), + err: nil, + }, + { + name: "all one", + input: &ExpressionOrList{ + All: []string{"true"}, + }, + output: []byte(`"true"`), + err: nil, + }, + { + name: "any", + input: &ExpressionOrList{ + Any: []string{"true", "false"}, + }, + output: []byte(`any: + - "true" + - "false"`), + err: nil, + }, + { + name: "any one", + input: &ExpressionOrList{ + Any: []string{"true"}, + }, + output: []byte(`"true"`), + err: nil, + }, + } { + t.Run(tt.name, func(t *testing.T) { + result, err := yaml.Marshal(tt.input) + if !errors.Is(err, tt.err) { + t.Errorf("wanted marshal error: %v but got: %v", tt.err, err) + } + + result = bytes.TrimSpace(result) + + if !bytes.Equal(result, tt.output) { + t.Logf("wanted: %q", string(tt.output)) + t.Logf("got: %q", string(result)) + t.Error("mismatched output") + } + }) + } +} + +func TestExpressionOrListUnmarshalJSON(t *testing.T) { for _, tt := range []struct { err error validErr error diff --git a/lib/policy/config/weight.go b/lib/policy/config/weight.go index f408111..aa8348f 100644 --- a/lib/policy/config/weight.go +++ b/lib/policy/config/weight.go @@ -1,5 +1,5 @@ package config type Weight struct { - Adjust int `json:"adjust"` + Adjust int `json:"adjust" yaml:"adjust"` } diff --git a/yeetfile.js b/yeetfile.js index 6f806f2..6992b1b 100644 --- a/yeetfile.js +++ b/yeetfile.js @@ -22,6 +22,7 @@ $`npm run assets`; build: ({ bin, etc, systemd, doc }) => { $`go build -o ${bin}/anubis -ldflags '-s -w -extldflags "-static" -X "github.com/TecharoHQ/anubis.Version=${git.tag()}"' ./cmd/anubis`; + $`go build -o ${bin}/anubis-robots2policy -ldflags '-s -w -extldflags "-static" -X "github.com/TecharoHQ/anubis.Version=${git.tag()}"' ./cmd/robots2policy`; file.install("./run/anubis@.service", `${systemd}/anubis@.service`); file.install("./run/default.env", `${etc}/default.env`);