diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index 51af499..cd39780 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -48,3 +48,11 @@ bots: action: CHALLENGE dnsbl: false + +# By default, send HTTP 200 back to clients that either get issued a challenge +# or a denial. This seems weird, but this is load-bearing due to the fact that +# the most aggressive scraper bots seem to really really want an HTTP 200 and +# will stop sending requests once they get it. +status_codes: + CHALLENGE: 200 + DENY: 200 \ No newline at end of file diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 277add0..d09a7dd 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Added the ability to [customize Anubis' HTTP status codes](./admin/configuration/custom-status-codes.mdx) ([#355](https://github.com/TecharoHQ/anubis/issues/355)) + ## v1.17.0: Asahi sas Brutus - Ensure regexes can't end in newlines ([#372](https://github.com/TecharoHQ/anubis/issues/372)) diff --git a/docs/docs/admin/configuration/custom-status-codes.mdx b/docs/docs/admin/configuration/custom-status-codes.mdx new file mode 100644 index 0000000..cebb32d --- /dev/null +++ b/docs/docs/admin/configuration/custom-status-codes.mdx @@ -0,0 +1,19 @@ +# Custom status codes for Anubis errors + +Out of the box, Anubis will reply with `HTTP 200` for challenge and denial pages. This is intentional: aggressive AI scrapers that are faced with a non-200 response tend to hammer the page over and over until they get a 200 response, so replying with 200 makes them stop sending requests. This default may not be desirable for every deployment, so Anubis lets you customize which HTTP status codes are returned when it serves challenge and denial pages. 
+ +This is configured in the `status_codes` block of your [bot policy file](../policies.mdx): + +```yaml +status_codes: + CHALLENGE: 200 + DENY: 200 +``` + +To match CloudFlare's behavior, use a configuration like this: + +```yaml +status_codes: + CHALLENGE: 403 + DENY: 403 +``` diff --git a/lib/anubis.go b/lib/anubis.go index bc14284..2a7add6 100644 --- a/lib/anubis.go +++ b/lib/anubis.go @@ -170,7 +170,7 @@ func (s *Server) checkRules(w http.ResponseWriter, r *http.Request, cr policy.Ch hash := rule.Hash() lg.Debug("rule hash", "hash", hash) - s.respondWithStatus(w, r, fmt.Sprintf("Access Denied: error code %s", hash), http.StatusOK) + s.respondWithStatus(w, r, fmt.Sprintf("Access Denied: error code %s", hash), s.policy.StatusCodes.Deny) return true case config.RuleChallenge: lg.Debug("challenge requested") @@ -202,7 +202,7 @@ func (s *Server) handleDNSBL(w http.ResponseWriter, r *http.Request, ip string, if resp != dnsbl.AllGood { lg.Info("DNSBL hit", "status", resp.String()) - s.respondWithStatus(w, r, fmt.Sprintf("DroneBL reported an entry: %s, see https://dronebl.org/lookup?ip=%s", resp.String(), ip), http.StatusOK) + s.respondWithStatus(w, r, fmt.Sprintf("DroneBL reported an entry: %s, see https://dronebl.org/lookup?ip=%s", resp.String(), ip), s.policy.StatusCodes.Deny) return true } } diff --git a/lib/anubis_test.go b/lib/anubis_test.go index 4f3a165..63e76f7 100644 --- a/lib/anubis_test.go +++ b/lib/anubis_test.go @@ -393,3 +393,48 @@ func TestBasePrefix(t *testing.T) { }) } } + +func TestCustomStatusCodes(t *testing.T) { + h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Log(r.UserAgent()) + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "OK") + }) + + statusMap := map[string]int{ + "ALLOW": 200, + "CHALLENGE": 401, + "DENY": 403, + } + + pol := loadPolicies(t, "./testdata/aggressive_403.yaml") + pol.DefaultDifficulty = 4 + + srv := spawnAnubis(t, Options{ + Next: h, + Policy: pol, + }) + + ts := 
// ErrStatusCodeNotValid is the sentinel wrapped by StatusCodes.Valid when a
// configured HTTP status code is outside the inclusive range 100-599.
var ErrStatusCodeNotValid = errors.New("config.StatusCode: status code not valid, must be between 100 and 599")

// StatusCodes holds the HTTP status codes configured in the `status_codes`
// block of a policy file.
type StatusCodes struct {
	// Challenge is the status code sent with challenge pages.
	Challenge int `json:"CHALLENGE"`
	// Deny is the status code sent with denial pages.
	Deny int `json:"DENY"`
}

// Valid reports whether both configured status codes lie in the inclusive
// range 100-599.
//
// It returns nil when both codes are acceptable. Otherwise it returns an error
// that wraps one ErrStatusCodeNotValid per offending field, so callers can
// detect the failure with errors.Is.
func (sc StatusCodes) Valid() error {
	var errs []error

	// A code is invalid when it is below 100 OR above 599. This also covers
	// the zero value, i.e. a field that was explicitly set to 0.
	if sc.Challenge < 100 || sc.Challenge > 599 {
		errs = append(errs, fmt.Errorf("%w: challenge is %d", ErrStatusCodeNotValid, sc.Challenge))
	}

	if sc.Deny < 100 || sc.Deny > 599 {
		errs = append(errs, fmt.Errorf("%w: deny is %d", ErrStatusCodeNotValid, sc.Deny))
	}

	if len(errs) != 0 {
		return fmt.Errorf("status codes not valid:\n%w", errors.Join(errs...))
	}

	return nil
}
".*", + "action": "DENY" + } + ], + "status_codes": { + "CHALLENGE": 0, + "DENY": 0 + } +} diff --git a/lib/policy/config/testdata/bad/status-codes-0.yaml b/lib/policy/config/testdata/bad/status-codes-0.yaml new file mode 100644 index 0000000..0d08322 --- /dev/null +++ b/lib/policy/config/testdata/bad/status-codes-0.yaml @@ -0,0 +1,8 @@ +bots: +- name: everything + user_agent_regex: .* + action: DENY + +status_codes: + CHALLENGE: 0 + DENY: 0 \ No newline at end of file diff --git a/lib/policy/config/testdata/good/status-codes-paranoid.json b/lib/policy/config/testdata/good/status-codes-paranoid.json new file mode 100644 index 0000000..f84dde9 --- /dev/null +++ b/lib/policy/config/testdata/good/status-codes-paranoid.json @@ -0,0 +1,13 @@ +{ + "bots": [ + { + "name": "everything", + "user_agent_regex": ".*", + "action": "DENY" + } + ], + "status_codes": { + "CHALLENGE": 200, + "DENY": 200 + } +} diff --git a/lib/policy/config/testdata/good/status-codes-paranoid.yaml b/lib/policy/config/testdata/good/status-codes-paranoid.yaml new file mode 100644 index 0000000..89655a3 --- /dev/null +++ b/lib/policy/config/testdata/good/status-codes-paranoid.yaml @@ -0,0 +1,8 @@ +bots: +- name: everything + user_agent_regex: .* + action: DENY + +status_codes: + CHALLENGE: 200 + DENY: 200 \ No newline at end of file diff --git a/lib/policy/config/testdata/good/status-codes-rfc.json b/lib/policy/config/testdata/good/status-codes-rfc.json new file mode 100644 index 0000000..2fdaac0 --- /dev/null +++ b/lib/policy/config/testdata/good/status-codes-rfc.json @@ -0,0 +1,13 @@ +{ + "bots": [ + { + "name": "everything", + "user_agent_regex": ".*", + "action": "DENY" + } + ], + "status_codes": { + "CHALLENGE": 403, + "DENY": 403 + } +} diff --git a/lib/policy/config/testdata/good/status-codes-rfc.yaml b/lib/policy/config/testdata/good/status-codes-rfc.yaml new file mode 100644 index 0000000..4e4e6d8 --- /dev/null +++ b/lib/policy/config/testdata/good/status-codes-rfc.yaml @@ -0,0 +1,8 @@ +bots: 
+- name: everything + user_agent_regex: .* + action: DENY + +status_codes: + CHALLENGE: 403 + DENY: 403 \ No newline at end of file diff --git a/lib/policy/policy.go b/lib/policy/policy.go index 7c45ff6..d5d1188 100644 --- a/lib/policy/policy.go +++ b/lib/policy/policy.go @@ -24,11 +24,13 @@ type ParsedConfig struct { Bots []Bot DNSBL bool DefaultDifficulty int + StatusCodes config.StatusCodes } func NewParsedConfig(orig *config.Config) *ParsedConfig { return &ParsedConfig{ - orig: orig, + orig: orig, + StatusCodes: orig.StatusCodes, } } diff --git a/lib/testdata/aggressive_403.yaml b/lib/testdata/aggressive_403.yaml new file mode 100644 index 0000000..facafd6 --- /dev/null +++ b/lib/testdata/aggressive_403.yaml @@ -0,0 +1,12 @@ +bots: +- name: deny + user_agent_regex: DENY + action: DENY + +- name: challenge + user_agent_regex: CHALLENGE + action: CHALLENGE + +status_codes: + CHALLENGE: 401 + DENY: 403 \ No newline at end of file diff --git a/test/anubis_configs/aggressive_403.yaml b/test/anubis_configs/aggressive_403.yaml new file mode 100644 index 0000000..facafd6 --- /dev/null +++ b/test/anubis_configs/aggressive_403.yaml @@ -0,0 +1,12 @@ +bots: +- name: deny + user_agent_regex: DENY + action: DENY + +- name: challenge + user_agent_regex: CHALLENGE + action: CHALLENGE + +status_codes: + CHALLENGE: 401 + DENY: 403 \ No newline at end of file diff --git a/test/unix-socket-xff/start.sh b/test/unix-socket-xff/start.sh index 2d13fba..840d768 100755 --- a/test/unix-socket-xff/start.sh +++ b/test/unix-socket-xff/start.sh @@ -37,6 +37,7 @@ go run ../cmd/unixhttpd & go tool anubis \ --bind=./anubis.sock \ --bind-network=unix \ + --policy-fname=../anubis_configs/aggressive_403.yaml \ --target=unix://$(pwd)/unixhttpd.sock & # A simple TLS terminator that forwards to Anubis, which will forward to diff --git a/test/unix-socket-xff/test.mjs b/test/unix-socket-xff/test.mjs new file mode 100644 index 0000000..8b8479c --- /dev/null +++ b/test/unix-socket-xff/test.mjs @@ -0,0 
// Sends a GET request to the /reqmeta endpoint with the given User-Agent
// header and resolves to the HTTP status code of the response.
async function testWithUserAgent(userAgent) {
  const response = await fetch("https://relayd.local.cetacean.club:3004/reqmeta", {
    headers: { "User-Agent": userAgent },
  });
  return response.status;
}

// Probe the three user agents one after another, in a fixed order.
const allow = await testWithUserAgent("ALLOW");
const challenge = await testWithUserAgent("CHALLENGE");
const deny = await testWithUserAgent("DENY");

const codes = { allow, challenge, deny };

// Status codes the test expects for each user agent.
const expected = {
  allow: 200,
  challenge: 401,
  deny: 403,
};

console.log("ALLOW: ", codes.allow);
console.log("CHALLENGE:", codes.challenge);
console.log("DENY: ", codes.deny);

// Both objects list their keys in the same order, so comparing the serialized
// forms is a sufficient equality check here.
const got = JSON.stringify(codes);
const want = JSON.stringify(expected);
if (got !== want) {
  throw new Error(`wanted ${want}, got: ${got}`);
}