anubis/lib/policy/config/config.go
Jason Cameron e0781e4560
feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657)
* feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies

* feat: add documentation for robots2policy CLI tool

* feat: implement crawl delay handling as weight adjustment in Anubis rules

* feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

* test: add comprehensive tests for robots2policy conversion and parsing

* fix: update example URL in usage instructions for robots2policy CLI

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* fix(robots2policy): use sigs.k8s.io/yaml

Signed-off-by: Xe Iaso <me@xeiaso.net>

* feat(config): properly marshal bot policy rules

Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(yeetfile): expose robots2policy in libexec

Signed-off-by: Xe Iaso <me@xeiaso.net>

* fix(yeetfile): put robots2policy in $PATH

Signed-off-by: Xe Iaso <me@xeiaso.net>

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* style: reorder imports

* refactor: use preexisting structs in config

* fix: correct flag check in main function

* fix: reorder fields in AnubisRule struct for better alignment

* style: improve alignment of struct fields in AnubisRule and OGTagCache

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* fix: add validation for generated Anubis rules from robots.txt

* feat: add batch processing for robots.txt files to generate Anubis CEL policies

* fix: improve usage message and error handling for input file requirement

* refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling

* refactor: reorganize policy definitions in YAML files for consistency and clarity

* fix: correct indentation in blacklist and complex YAML files for consistency

* test: enhance output comparison in robots2policy tests for YAML and JSON formats

* Revert "fix: improve usage message and error handling for input file requirement"

This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8.

* fix: improve usage message and error handling in robots2policy

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>
Signed-off-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Co-authored-by: Xe Iaso <me@xeiaso.net>
2025-06-14 23:41:00 -04:00

406 lines
10 KiB
Go

package config
import (
"errors"
"fmt"
"io"
"io/fs"
"net"
"net/http"
"os"
"regexp"
"strings"
"github.com/TecharoHQ/anubis/data"
"k8s.io/apimachinery/pkg/util/yaml"
)
var (
ErrNoBotRulesDefined = errors.New("config: must define at least one (1) bot rule")
ErrBotMustHaveName = errors.New("config.Bot: must set name")
ErrBotMustHaveUserAgentOrPath = errors.New("config.Bot: must set either user_agent_regex, path_regex, headers_regex, or remote_addresses")
ErrBotMustHaveUserAgentOrPathNotBoth = errors.New("config.Bot: must set either user_agent_regex, path_regex, and not both")
ErrUnknownAction = errors.New("config.Bot: unknown action")
ErrInvalidUserAgentRegex = errors.New("config.Bot: invalid user agent regex")
ErrInvalidPathRegex = errors.New("config.Bot: invalid path regex")
ErrInvalidHeadersRegex = errors.New("config.Bot: invalid headers regex")
ErrInvalidCIDR = errors.New("config.Bot: invalid CIDR")
ErrRegexEndsWithNewline = errors.New("config.Bot: regular expression ends with newline (try >- instead of > in yaml)")
ErrInvalidImportStatement = errors.New("config.ImportStatement: invalid source file")
ErrCantSetBotAndImportValuesAtOnce = errors.New("config.BotOrImport: can't set bot rules and import values at the same time")
ErrMustSetBotOrImportRules = errors.New("config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both")
ErrStatusCodeNotValid = errors.New("config.StatusCode: status code not valid, must be between 100 and 599")
)
type Rule string
const (
RuleUnknown Rule = ""
RuleAllow Rule = "ALLOW"
RuleDeny Rule = "DENY"
RuleChallenge Rule = "CHALLENGE"
RuleWeigh Rule = "WEIGH"
RuleBenchmark Rule = "DEBUG_BENCHMARK"
)
const DefaultAlgorithm = "fast"
type BotConfig struct {
UserAgentRegex *string `json:"user_agent_regex,omitempty" yaml:"user_agent_regex,omitempty"`
PathRegex *string `json:"path_regex,omitempty" yaml:"path_regex,omitempty"`
HeadersRegex map[string]string `json:"headers_regex,omitempty" yaml:"headers_regex,omitempty"`
Expression *ExpressionOrList `json:"expression,omitempty" yaml:"expression,omitempty"`
Challenge *ChallengeRules `json:"challenge,omitempty" yaml:"challenge,omitempty"`
Weight *Weight `json:"weight,omitempty" yaml:"weight,omitempty"`
Name string `json:"name" yaml:"name"`
Action Rule `json:"action" yaml:"action"`
RemoteAddr []string `json:"remote_addresses,omitempty" yaml:"remote_addresses,omitempty"`
}
func (b BotConfig) Zero() bool {
for _, cond := range []bool{
b.Name != "",
b.UserAgentRegex != nil,
b.PathRegex != nil,
len(b.HeadersRegex) != 0,
b.Action != "",
len(b.RemoteAddr) != 0,
b.Challenge != nil,
} {
if cond {
return false
}
}
return true
}
func (b *BotConfig) Valid() error {
var errs []error
if b.Name == "" {
errs = append(errs, ErrBotMustHaveName)
}
allFieldsEmpty := b.UserAgentRegex == nil &&
b.PathRegex == nil &&
len(b.RemoteAddr) == 0 &&
len(b.HeadersRegex) == 0
if allFieldsEmpty && b.Expression == nil {
errs = append(errs, ErrBotMustHaveUserAgentOrPath)
}
if b.UserAgentRegex != nil && b.PathRegex != nil {
errs = append(errs, ErrBotMustHaveUserAgentOrPathNotBoth)
}
if b.UserAgentRegex != nil {
if strings.HasSuffix(*b.UserAgentRegex, "\n") {
errs = append(errs, fmt.Errorf("%w: user agent regex: %q", ErrRegexEndsWithNewline, *b.UserAgentRegex))
}
if _, err := regexp.Compile(*b.UserAgentRegex); err != nil {
errs = append(errs, ErrInvalidUserAgentRegex, err)
}
}
if b.PathRegex != nil {
if strings.HasSuffix(*b.PathRegex, "\n") {
errs = append(errs, fmt.Errorf("%w: path regex: %q", ErrRegexEndsWithNewline, *b.PathRegex))
}
if _, err := regexp.Compile(*b.PathRegex); err != nil {
errs = append(errs, ErrInvalidPathRegex, err)
}
}
if len(b.HeadersRegex) > 0 {
for name, expr := range b.HeadersRegex {
if name == "" {
continue
}
if strings.HasSuffix(expr, "\n") {
errs = append(errs, fmt.Errorf("%w: header %s regex: %q", ErrRegexEndsWithNewline, name, expr))
}
if _, err := regexp.Compile(expr); err != nil {
errs = append(errs, ErrInvalidHeadersRegex, err)
}
}
}
if len(b.RemoteAddr) > 0 {
for _, cidr := range b.RemoteAddr {
if _, _, err := net.ParseCIDR(cidr); err != nil {
errs = append(errs, ErrInvalidCIDR, err)
}
}
}
if b.Expression != nil {
if err := b.Expression.Valid(); err != nil {
errs = append(errs, err)
}
}
switch b.Action {
case RuleAllow, RuleBenchmark, RuleChallenge, RuleDeny, RuleWeigh:
// okay
default:
errs = append(errs, fmt.Errorf("%w: %q", ErrUnknownAction, b.Action))
}
if b.Action == RuleChallenge && b.Challenge != nil {
if err := b.Challenge.Valid(); err != nil {
errs = append(errs, err)
}
}
if b.Action == RuleWeigh && b.Weight == nil {
b.Weight = &Weight{Adjust: 5}
}
if len(errs) != 0 {
return fmt.Errorf("config: bot entry for %q is not valid:\n%w", b.Name, errors.Join(errs...))
}
return nil
}
type ChallengeRules struct {
Algorithm string `json:"algorithm,omitempty" yaml:"algorithm,omitempty"`
Difficulty int `json:"difficulty,omitempty" yaml:"difficulty,omitempty"`
ReportAs int `json:"report_as,omitempty" yaml:"report_as,omitempty"`
}
var (
ErrChallengeDifficultyTooLow = errors.New("config.Bot.ChallengeRules: difficulty is too low (must be >= 1)")
ErrChallengeDifficultyTooHigh = errors.New("config.Bot.ChallengeRules: difficulty is too high (must be <= 64)")
)
func (cr ChallengeRules) Valid() error {
var errs []error
if cr.Difficulty < 1 {
errs = append(errs, fmt.Errorf("%w, got: %d", ErrChallengeDifficultyTooLow, cr.Difficulty))
}
if cr.Difficulty > 64 {
errs = append(errs, fmt.Errorf("%w, got: %d", ErrChallengeDifficultyTooHigh, cr.Difficulty))
}
if len(errs) != 0 {
return fmt.Errorf("config: challenge rules entry is not valid:\n%w", errors.Join(errs...))
}
return nil
}
type ImportStatement struct {
Import string `json:"import"`
Bots []BotConfig
}
func (is *ImportStatement) open() (fs.File, error) {
if strings.HasPrefix(is.Import, "(data)/") {
fname := strings.TrimPrefix(is.Import, "(data)/")
fin, err := data.BotPolicies.Open(fname)
return fin, err
}
return os.Open(is.Import)
}
func (is *ImportStatement) load() error {
fin, err := is.open()
if err != nil {
return fmt.Errorf("%w: %s: %w", ErrInvalidImportStatement, is.Import, err)
}
defer fin.Close()
var imported []BotOrImport
var result []BotConfig
if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&imported); err != nil {
return fmt.Errorf("can't parse %s: %w", is.Import, err)
}
var errs []error
for _, b := range imported {
if err := b.Valid(); err != nil {
errs = append(errs, err)
}
if b.ImportStatement != nil {
result = append(result, b.ImportStatement.Bots...)
}
if b.BotConfig != nil {
result = append(result, *b.BotConfig)
}
}
if len(errs) != 0 {
return fmt.Errorf("config %s is not valid:\n%w", is.Import, errors.Join(errs...))
}
is.Bots = result
return nil
}
func (is *ImportStatement) Valid() error {
return is.load()
}
type BotOrImport struct {
*BotConfig `json:",inline"`
*ImportStatement `json:",inline"`
}
func (boi *BotOrImport) Valid() error {
if boi.BotConfig != nil && boi.ImportStatement != nil {
return ErrCantSetBotAndImportValuesAtOnce
}
if boi.BotConfig != nil {
return boi.BotConfig.Valid()
}
if boi.ImportStatement != nil {
return boi.ImportStatement.Valid()
}
return ErrMustSetBotOrImportRules
}
type StatusCodes struct {
Challenge int `json:"CHALLENGE"`
Deny int `json:"DENY"`
}
func (sc StatusCodes) Valid() error {
var errs []error
if sc.Challenge == 0 || (sc.Challenge < 100 && sc.Challenge >= 599) {
errs = append(errs, fmt.Errorf("%w: challenge is %d", ErrStatusCodeNotValid, sc.Challenge))
}
if sc.Deny == 0 || (sc.Deny < 100 && sc.Deny >= 599) {
errs = append(errs, fmt.Errorf("%w: deny is %d", ErrStatusCodeNotValid, sc.Deny))
}
if len(errs) != 0 {
return fmt.Errorf("status codes not valid:\n%w", errors.Join(errs...))
}
return nil
}
type fileConfig struct {
Bots []BotOrImport `json:"bots"`
DNSBL bool `json:"dnsbl"`
StatusCodes StatusCodes `json:"status_codes"`
}
func (c fileConfig) Valid() error {
var errs []error
if len(c.Bots) == 0 {
errs = append(errs, ErrNoBotRulesDefined)
}
for _, b := range c.Bots {
if err := b.Valid(); err != nil {
errs = append(errs, err)
}
}
if err := c.StatusCodes.Valid(); err != nil {
errs = append(errs, err)
}
if len(errs) != 0 {
return fmt.Errorf("config is not valid:\n%w", errors.Join(errs...))
}
return nil
}
func Load(fin io.Reader, fname string) (*Config, error) {
var c fileConfig
c.StatusCodes = StatusCodes{
Challenge: http.StatusOK,
Deny: http.StatusOK,
}
if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil {
return nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err)
}
if err := c.Valid(); err != nil {
return nil, err
}
result := &Config{
DNSBL: c.DNSBL,
StatusCodes: c.StatusCodes,
}
var validationErrs []error
for _, boi := range c.Bots {
if boi.ImportStatement != nil {
if err := boi.load(); err != nil {
validationErrs = append(validationErrs, err)
continue
}
result.Bots = append(result.Bots, boi.ImportStatement.Bots...)
}
if boi.BotConfig != nil {
if err := boi.BotConfig.Valid(); err != nil {
validationErrs = append(validationErrs, err)
continue
}
result.Bots = append(result.Bots, *boi.BotConfig)
}
}
if len(validationErrs) > 0 {
return nil, fmt.Errorf("errors validating policy config %s: %w", fname, errors.Join(validationErrs...))
}
return result, nil
}
type Config struct {
Bots []BotConfig
DNSBL bool
StatusCodes StatusCodes
}
func (c Config) Valid() error {
var errs []error
if len(c.Bots) == 0 {
errs = append(errs, ErrNoBotRulesDefined)
}
for _, b := range c.Bots {
if err := b.Valid(); err != nil {
errs = append(errs, err)
}
}
if len(errs) != 0 {
return fmt.Errorf("config is not valid:\n%w", errors.Join(errs...))
}
return nil
}