From de7dbfe6d6cdca7cfc3d09ee8940ec5bb9999922 Mon Sep 17 00:00:00 2001 From: Corry Haines Date: Sun, 1 Jun 2025 13:21:18 -0700 Subject: [PATCH] Split up AI filtering files (#592) * Split up AI filtering files Create aggressive/moderate/permissive policies to allow administrators to choose their AI/LLM stance. Aggressive policy matches existing default in Anubis. Removes `Google-Extended` flag from `ai-robots-txt.yaml` as it doesn't exist in requests. Rename `ai-robots-txt.yaml` to `ai-catchall.yaml` as the file is no longer a copy of the source repo/file. * chore: spelling * chore: fix embeds * chore: fix data includes * chore: fix file name typo * chore: Ignore READMEs in configs * chore(lib/policy/config): go tool goimports -w Signed-off-by: Xe Iaso --------- Signed-off-by: Xe Iaso Co-authored-by: Xe Iaso --- .github/actions/spelling/expect.txt | 25 +++++++++++++++++++ data/botPolicies.json | 2 +- data/botPolicies.yaml | 8 ++++-- data/bots/ai-catchall.yaml | 11 ++++++++ data/bots/ai-robots-txt.yaml | 6 ----- data/clients/ai.yaml | 8 ++++++ data/crawlers/ai-search.yaml | 8 ++++++ data/crawlers/ai-training.yaml | 8 ++++++ data/embed.go | 2 +- data/meta/README.md | 5 ++++ data/meta/ai-block-aggressive.yaml | 6 +++++ data/meta/ai-block-moderate.yaml | 7 ++++++ data/meta/ai-block-permissive.yaml | 6 +++++ docs/docs/CHANGELOG.md | 2 ++ docs/docs/admin/configuration/import.mdx | 12 ++++----- lib/policy/config/config_test.go | 4 +++ .../config/testdata/bad/import_and_bot.json | 2 +- .../config/testdata/bad/import_and_bot.yaml | 2 +- yeetfile.js | 1 + 19 files changed, 107 insertions(+), 18 deletions(-) create mode 100644 data/bots/ai-catchall.yaml delete mode 100644 data/bots/ai-robots-txt.yaml create mode 100644 data/clients/ai.yaml create mode 100644 data/crawlers/ai-search.yaml create mode 100644 data/crawlers/ai-training.yaml create mode 100644 data/meta/README.md create mode 100644 data/meta/ai-block-aggressive.yaml create mode 100644 
data/meta/ai-block-moderate.yaml create mode 100644 data/meta/ai-block-permissive.yaml diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index b355b87..6bc08d5 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -18,7 +18,9 @@ blueskybot boi botnet BPort +Brightbot broked +Bytespider cachebuster Caddyfile caninetools @@ -41,6 +43,7 @@ cloudflare confd containerbuild coreutils +Cotoyogi CRDs crt daemonizing @@ -49,6 +52,7 @@ Debian debrpm decaymap decompiling +Diffbot discordapp discordbot distros @@ -66,11 +70,15 @@ everyones evilbot evilsite expressionorlist +externalagent +externalfetcher extldflags facebookgo +Factset fastcgi fediverse finfos +Firecrawl flagenv Fordola forgejo @@ -86,6 +94,7 @@ googlebot govulncheck GPG GPT +gptbot grw Hashcash hashrate @@ -97,8 +106,11 @@ hostable htmx httpdebug hypertext +iaskspider iat ifm +Imagesift +imgproxy inp iss isset @@ -146,11 +158,15 @@ nginx nobots NONINFRINGEMENT nosleep +OCOB ogtags +omgili +omgilibot onionservice openai openrc pag +Pangu parseable passthrough Patreon @@ -185,18 +201,22 @@ RUnlock sas sasl Scumm +searchbot searx sebest secretplans selfsigned +Semrush setsebool shellcheck +Sidetrade sitemap sls sni Sourceware Spambot sparkline +spyderbot srv stackoverflow startprecmd @@ -212,12 +232,15 @@ techarohq templ templruntime testarea +Tik +Timpibot torproject traefik unixhttpd unmarshal uvx Varis +Velen vendored vhosts videotest @@ -227,9 +250,11 @@ webmaster webpage websecure websites +Webzio wordpress Workaround workdir +wpbot xcaddy Xeact xeiaso diff --git a/data/botPolicies.json b/data/botPolicies.json index 5f24e99..6227639 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -4,7 +4,7 @@ "import": "(data)/bots/_deny-pathological.yaml" }, { - "import": "(data)/bots/ai-robots-txt.yaml" + "import": "(data)/meta/ai-block-aggressive.yaml" }, { "import": "(data)/crawlers/_allow-good.yaml" diff --git 
a/data/botPolicies.yaml b/data/botPolicies.yaml index 917c417..5e444fe 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -17,8 +17,12 @@ bots: import: (data)/bots/_deny-pathological.yaml - import: (data)/bots/aggressive-brazilian-scrapers.yaml - # Enforce https://github.com/ai-robots-txt/ai.robots.txt - - import: (data)/bots/ai-robots-txt.yaml + # Aggressively block AI/LLM related bots/agents by default + - import: (data)/meta/ai-block-aggressive.yaml + + # Consider replacing the aggressive AI policy with more selective policies: + # - import: (data)/meta/ai-block-moderate.yaml + # - import: (data)/meta/ai-block-permissive.yaml # Search engine crawlers to allow, defaults to: # - Google (so they don't try to bypass Anubis) diff --git a/data/bots/ai-catchall.yaml b/data/bots/ai-catchall.yaml new file mode 100644 index 0000000..05dc6e1 --- /dev/null +++ b/data/bots/ai-catchall.yaml @@ -0,0 +1,11 @@ +# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt +# Add new/undocumented agents here. Where documentation exists, consider moving to dedicated policy files. +# Notes on various agents: +# - Amazonbot: Well documented, but they refuse to state which agent collects training data. +# - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated or hallucinations? +# - Perplexity*: Well documented, but they refuse to state which agent collects training data. +# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. 
+- name: "ai-catchall" + user_agent_regex: >- + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot + action: DENY diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml deleted file mode 100644 index e515201..0000000 --- a/data/bots/ai-robots-txt.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. 
-# Note: Blocks human-directed/non-training user agents -- name: "ai-robots-txt" - user_agent_regex: >- AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot - action: DENY diff --git a/data/clients/ai.yaml b/data/clients/ai.yaml new file mode 100644 index 0000000..0ec7022 --- /dev/null +++ b/data/clients/ai.yaml @@ -0,0 +1,8 @@ +# User agents that act on behalf of humans in AI tools, e.g. searching the web. +# Each entry should have a positive/ALLOW entry created as well, with further documentation. +# Exceptions: +# - Claude-User: No published IP allowlist +- name: "ai-clients" + user_agent_regex: >- + ChatGPT-User|Claude-User|MistralAI-User + action: DENY diff --git a/data/crawlers/ai-search.yaml b/data/crawlers/ai-search.yaml new file mode 100644 index 0000000..91855bf --- /dev/null +++ b/data/crawlers/ai-search.yaml @@ -0,0 +1,8 @@ +# User agents that index exclusively for search for AI systems. +# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions: +# - Claude-SearchBot: No published IP allowlist +- name: "ai-crawlers-search" + user_agent_regex: >- + OAI-SearchBot|Claude-SearchBot + action: DENY diff --git a/data/crawlers/ai-training.yaml b/data/crawlers/ai-training.yaml new file mode 100644 index 0000000..fc3ae34 --- /dev/null +++ b/data/crawlers/ai-training.yaml @@ -0,0 +1,8 @@ +# User agents that crawl for training AI/LLM systems +# Each entry should have a positive/ALLOW entry created as well, with further documentation. +# Exceptions: +# - ClaudeBot: No published IP allowlist +- name: "ai-crawlers-training" + user_agent_regex: >- + GPTBot|ClaudeBot + action: DENY diff --git a/data/embed.go b/data/embed.go index 849c75f..c3ed06f 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package data import "embed" var ( - //go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers + //go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta BotPolicies embed.FS ) diff --git a/data/meta/README.md b/data/meta/README.md new file mode 100644 index 0000000..bf758c7 --- /dev/null +++ b/data/meta/README.md @@ -0,0 +1,5 @@ +# meta policies + +Contains policies that exclusively reference policies in _multiple_ other data folders. + +Akin to "stances" that the administrator can take, with reference to various topics, such as AI/LLM systems. \ No newline at end of file diff --git a/data/meta/ai-block-aggressive.yaml b/data/meta/ai-block-aggressive.yaml new file mode 100644 index 0000000..f76e15e --- /dev/null +++ b/data/meta/ai-block-aggressive.yaml @@ -0,0 +1,6 @@ +# Blocks all AI/LLM associated user agents, regardless of purpose or human agency +# Warning: To completely block some AI/LLM training, such as with Google, you _must_ place flags in robots.txt. 
+- import: (data)/bots/ai-catchall.yaml +- import: (data)/clients/ai.yaml +- import: (data)/crawlers/ai-search.yaml +- import: (data)/crawlers/ai-training.yaml \ No newline at end of file diff --git a/data/meta/ai-block-moderate.yaml b/data/meta/ai-block-moderate.yaml new file mode 100644 index 0000000..3fb5fb9 --- /dev/null +++ b/data/meta/ai-block-moderate.yaml @@ -0,0 +1,7 @@ +# Blocks all AI/LLM bots used for training or unknown/undocumented purposes. +# Permits user agents with explicitly documented non-training use, and published IP allowlists. +- import: (data)/bots/ai-catchall.yaml +- import: (data)/crawlers/ai-training.yaml +- import: (data)/crawlers/openai-searchbot.yaml +- import: (data)/clients/openai-chatgpt-user.yaml +- import: (data)/clients/mistral-mistralai-user.yaml \ No newline at end of file diff --git a/data/meta/ai-block-permissive.yaml b/data/meta/ai-block-permissive.yaml new file mode 100644 index 0000000..09a3446 --- /dev/null +++ b/data/meta/ai-block-permissive.yaml @@ -0,0 +1,6 @@ +# Permits all well documented AI/LLM user agents with published IP allowlists. +- import: (data)/bots/ai-catchall.yaml +- import: (data)/crawlers/openai-searchbot.yaml +- import: (data)/crawlers/openai-gptbot.yaml +- import: (data)/clients/openai-chatgpt-user.yaml +- import: (data)/clients/mistral-mistralai-user.yaml \ No newline at end of file diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 45a18b3..cba3cbc 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `--version` flag. - Added `anubis_proxied_requests_total` metric to count proxied requests. - Add `Applebot` as "good" web crawler +- Reorganize AI/LLM crawler blocking into three separate stances, maintaining existing status quo as default. +- Split out AI/LLM user agent blocking policies, adding documentation for each. 
## v1.18.0: Varis zos Galvus diff --git a/docs/docs/admin/configuration/import.mdx b/docs/docs/admin/configuration/import.mdx index 9a526e0..13b7992 100644 --- a/docs/docs/admin/configuration/import.mdx +++ b/docs/docs/admin/configuration/import.mdx @@ -14,7 +14,7 @@ EG: { "bots": [ { - "import": "(data)/bots/ai-robots-txt.yaml" + "import": "(data)/bots/ai-catchall.yaml" }, { "import": "(data)/bots/cloudflare-workers.yaml" @@ -29,8 +29,8 @@ EG: ```yaml bots: # Pathological bots to deny - - # This correlates to data/bots/ai-robots-txt.yaml in the source tree - import: (data)/bots/ai-robots-txt.yaml + - # This correlates to data/bots/ai-catchall.yaml in the source tree + import: (data)/bots/ai-catchall.yaml - import: (data)/bots/cloudflare-workers.yaml ``` @@ -46,7 +46,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con { "bots": [ { - "import": "(data)/bots/ai-robots-txt.yaml", + "import": "(data)/bots/ai-catchall.yaml", "name": "generic-browser", "user_agent_regex": "Mozilla|Opera\n", "action": "CHALLENGE" @@ -60,7 +60,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con ```yaml bots: - - import: (data)/bots/ai-robots-txt.yaml + - import: (data)/bots/ai-catchall.yaml name: generic-browser user_agent_regex: > Mozilla|Opera @@ -167,7 +167,7 @@ static ├── botPolicies.json ├── botPolicies.yaml ├── bots -│ ├── ai-robots-txt.yaml +│ ├── ai-catchall.yaml │ ├── cloudflare-workers.yaml │ ├── headless-browsers.yaml │ └── us-ai-scraper.yaml diff --git a/lib/policy/config/config_test.go b/lib/policy/config/config_test.go index afc1ab8..05515cd 100644 --- a/lib/policy/config/config_test.go +++ b/lib/policy/config/config_test.go @@ -251,6 +251,7 @@ func TestImportStatement(t *testing.T) { "bots", "common", "crawlers", + "meta", } { if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error { if err != nil { @@ -259,6 +260,9 @@ func TestImportStatement(t *testing.T) { if 
d.IsDir() { return nil } + if d.Name() == "README.md" { + return nil + } tests = append(tests, testCase{ name: "(data)/" + path, diff --git a/lib/policy/config/testdata/bad/import_and_bot.json b/lib/policy/config/testdata/bad/import_and_bot.json index 7fa4255..3d0519b 100644 --- a/lib/policy/config/testdata/bad/import_and_bot.json +++ b/lib/policy/config/testdata/bad/import_and_bot.json @@ -1,7 +1,7 @@ { "bots": [ { - "import": "(data)/bots/ai-robots-txt.yaml", + "import": "(data)/bots/ai-catchall.yaml", "name": "generic-browser", "user_agent_regex": "Mozilla|Opera\n", "action": "CHALLENGE" diff --git a/lib/policy/config/testdata/bad/import_and_bot.yaml b/lib/policy/config/testdata/bad/import_and_bot.yaml index 0080b10..fdfaa43 100644 --- a/lib/policy/config/testdata/bad/import_and_bot.yaml +++ b/lib/policy/config/testdata/bad/import_and_bot.yaml @@ -1,5 +1,5 @@ bots: -- import: (data)/bots/ai-robots-txt.yaml +- import: (data)/bots/ai-catchall.yaml name: generic-browser user_agent_regex: > Mozilla|Opera diff --git a/yeetfile.js b/yeetfile.js index 59dbf8c..6f806f2 100644 --- a/yeetfile.js +++ b/yeetfile.js @@ -35,6 +35,7 @@ $`npm run assets`; $`cp -a data/clients ${doc}/data/clients`; $`cp -a data/common ${doc}/data/common`; $`cp -a data/crawlers ${doc}/data/crawlers`; + $`cp -a data/meta ${doc}/data/meta`; }, })); });