diff --git a/data/botPolicies.json b/data/botPolicies.json index 160bbf0..7af1d90 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -1,40 +1,13 @@ { "bots": [ + { + "import": "(data)/bots/_deny-pathological.yaml" + }, { "import": "(data)/bots/ai-robots-txt.yaml" }, { - "import": "(data)/bots/cloudflare-workers.yaml" - }, - { - "import": "(data)/bots/headless-browsers.yaml" - }, - { - "import": "(data)/bots/us-ai-scraper.yaml" - }, - { - "import": "(data)/crawlers/googlebot.yaml" - }, - { - "import": "(data)/crawlers/bingbot.yaml" - }, - { - "import": "(data)/crawlers/duckduckbot.yaml" - }, - { - "import": "(data)/crawlers/qwantbot.yaml" - }, - { - "import": "(data)/crawlers/internet-archive.yaml" - }, - { - "import": "(data)/crawlers/kagibot.yaml" - }, - { - "import": "(data)/crawlers/marginalia.yaml" - }, - { - "import": "(data)/crawlers/mojeekbot.yaml" + "import": "(data)/crawlers/_allow-good.yaml" }, { "import": "(data)/common/keep-internet-working.yaml" @@ -45,5 +18,9 @@ "action": "CHALLENGE" } ], - "dnsbl": false -} \ No newline at end of file + "dnsbl": false, + "status_codes": { + "CHALLENGE": 200, + "DENY": 200 + } +} diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index cd39780..e688491 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -12,21 +12,23 @@ bots: # Pathological bots to deny -- # This correlates to data/bots/ai-robots-txt.yaml in the source tree - import: (data)/bots/ai-robots-txt.yaml -- import: (data)/bots/cloudflare-workers.yaml -- import: (data)/bots/headless-browsers.yaml -- import: (data)/bots/us-ai-scraper.yaml +- # This correlates to data/bots/deny-pathological.yaml in the source tree + # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml + import: (data)/bots/_deny-pathological.yaml -# Search engines to allow -- import: (data)/crawlers/googlebot.yaml -- import: (data)/crawlers/bingbot.yaml -- import: (data)/crawlers/duckduckbot.yaml -- import: 
(data)/crawlers/qwantbot.yaml -- import: (data)/crawlers/internet-archive.yaml -- import: (data)/crawlers/kagibot.yaml -- import: (data)/crawlers/marginalia.yaml -- import: (data)/crawlers/mojeekbot.yaml +# Enforce https://github.com/ai-robots-txt/ai.robots.txt +- import: (data)/bots/ai-robots-txt.yaml + +# Search engine crawlers to allow, defaults to: +# - Google (so they don't try to bypass Anubis) +# - Bing +# - DuckDuckGo +# - Qwant +# - The Internet Archive +# - Kagi +# - Marginalia +# - Mojeek +- import: (data)/crawlers/_allow-good.yaml # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) - import: (data)/common/keep-internet-working.yaml diff --git a/data/bots/_deny-pathological.yaml b/data/bots/_deny-pathological.yaml new file mode 100644 index 0000000..09d4bfc --- /dev/null +++ b/data/bots/_deny-pathological.yaml @@ -0,0 +1,3 @@ +- import: (data)/bots/cloudflare-workers.yaml +- import: (data)/bots/headless-browsers.yaml +- import: (data)/bots/us-ai-scraper.yaml \ No newline at end of file diff --git a/data/crawlers/_allow-good.yaml b/data/crawlers/_allow-good.yaml new file mode 100644 index 0000000..f95e176 --- /dev/null +++ b/data/crawlers/_allow-good.yaml @@ -0,0 +1,8 @@ +- import: (data)/crawlers/googlebot.yaml +- import: (data)/crawlers/bingbot.yaml +- import: (data)/crawlers/duckduckbot.yaml +- import: (data)/crawlers/qwantbot.yaml +- import: (data)/crawlers/internet-archive.yaml +- import: (data)/crawlers/kagibot.yaml +- import: (data)/crawlers/marginalia.yaml +- import: (data)/crawlers/mojeekbot.yaml \ No newline at end of file diff --git a/data/embed.go b/data/embed.go index ebb2152..3e5278f 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package data import "embed" var ( - //go:embed botPolicies.yaml botPolicies.json apps bots common crawlers + //go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:common all:crawlers BotPolicies embed.FS ) diff --git a/docs/docs/CHANGELOG.md 
b/docs/docs/CHANGELOG.md index 6bef2d8..0666c32 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added customization of authorization cookie expiration time with `--cookie-expiration-time` flag or envvar - Updated the `OG_PASSTHROUGH` to be true by default, thereby allowing OpenGraph tags to be passed through by default - Added the ability to [customize Anubis' HTTP status codes](./admin/configuration/custom-status-codes.mdx) ([#355](https://github.com/TecharoHQ/anubis/issues/355)) +- Changed import syntax to allow multi-level imports ## v1.17.0: Asahi sas Brutus diff --git a/docs/docs/admin/configuration/import.mdx b/docs/docs/admin/configuration/import.mdx index 9934ce7..9a526e0 100644 --- a/docs/docs/admin/configuration/import.mdx +++ b/docs/docs/admin/configuration/import.mdx @@ -79,6 +79,45 @@ config.BotOrImport: rule definition is invalid, you must set either bot rules or Paths can either be prefixed with `(data)` to import from the [the data folder in the Anubis source tree](https://github.com/TecharoHQ/anubis/tree/main/data) or anywhere on the filesystem. If you don't have access to the Anubis source tree, check /usr/share/docs/anubis/data or in the tarball you extracted Anubis from. +## Importing from imports + +You can also import from an imported file in case you want to import an entire folder of rules at once. 
+ + + + +```json +{ + "bots": [ + { + "import": "(data)/bots/_deny-pathological.yaml" + } + ] +} +``` + + + + +```yaml +bots: + - import: (data)/bots/_deny-pathological.yaml +``` + + + + +This lets you import an entire ruleset at once: + +```yaml +# (data)/bots/_deny-pathological.yaml +- import: (data)/bots/cloudflare-workers.yaml +- import: (data)/bots/headless-browsers.yaml +- import: (data)/bots/us-ai-scraper.yaml +``` + +Use this with care, you can easily get yourself into a state where Anubis recursively imports things for eternity if you are not careful. The best way to use this is to make a "root import" named `_everything.yaml` or `_allow-good.yaml` so they sort to the top. Name your meta-imports after the main verb they are enforcing so that you can glance at the configuration file and understand what it's doing. + ## Writing snippets Snippets can be written in either JSON or YAML, with a preference for YAML. When writing a snippet, write the bot rules you want directly at the top level of the file in a list. diff --git a/lib/policy/config/config.go b/lib/policy/config/config.go index 9dd61c9..2c51401 100644 --- a/lib/policy/config/config.go +++ b/lib/policy/config/config.go @@ -216,18 +216,27 @@ func (is *ImportStatement) load() error { } defer fin.Close() + var imported []BotOrImport var result []BotConfig - if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&result); err != nil { + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&imported); err != nil { return fmt.Errorf("can't parse %s: %w", is.Import, err) } var errs []error - for _, b := range result { + for _, b := range imported { if err := b.Valid(); err != nil { errs = append(errs, err) } + + if b.ImportStatement != nil { + result = append(result, b.ImportStatement.Bots...) + } + + if b.BotConfig != nil { + result = append(result, *b.BotConfig) + } } if len(errs) != 0 {