From de7dbfe6d6cdca7cfc3d09ee8940ec5bb9999922 Mon Sep 17 00:00:00 2001 From: Corry Haines Date: Sun, 1 Jun 2025 13:21:18 -0700 Subject: [PATCH] Split up AI filtering files (#592) * Split up AI filtering files Create aggressive/moderate/permissive policies to allow administrators to choose their AI/LLM stance. Aggressive policy matches existing default in Anubis. Removes `Google-Extended` flag from `ai-robots-txt.yaml` as it doesn't exist in requests. Rename `ai-robots-txt.yaml` to `ai-catchall.yaml` as the file is no longer a copy of the source repo/file. * chore: spelling * chore: fix embeds * chore: fix data includes * chore: fix file name typo * chore: Ignore READMEs in configs * chore(lib/policy/config): go tool goimports -w Signed-off-by: Xe Iaso --------- Signed-off-by: Xe Iaso Co-authored-by: Xe Iaso --- .github/actions/spelling/expect.txt | 25 +++++++++++++++++++ data/botPolicies.json | 2 +- data/botPolicies.yaml | 8 ++++-- data/bots/ai-catchall.yaml | 11 ++++++++ data/bots/ai-robots-txt.yaml | 6 ----- data/clients/ai.yaml | 8 ++++++ data/crawlers/ai-search.yaml | 8 ++++++ data/crawlers/ai-training.yaml | 8 ++++++ data/embed.go | 2 +- data/meta/README.md | 5 ++++ data/meta/ai-block-aggressive.yaml | 6 +++++ data/meta/ai-block-moderate.yaml | 7 ++++++ data/meta/ai-block-permissive.yaml | 6 +++++ docs/docs/CHANGELOG.md | 2 ++ docs/docs/admin/configuration/import.mdx | 12 ++++----- lib/policy/config/config_test.go | 4 +++ .../config/testdata/bad/import_and_bot.json | 2 +- .../config/testdata/bad/import_and_bot.yaml | 2 +- yeetfile.js | 1 + 19 files changed, 107 insertions(+), 18 deletions(-) create mode 100644 data/bots/ai-catchall.yaml delete mode 100644 data/bots/ai-robots-txt.yaml create mode 100644 data/clients/ai.yaml create mode 100644 data/crawlers/ai-search.yaml create mode 100644 data/crawlers/ai-training.yaml create mode 100644 data/meta/README.md create mode 100644 data/meta/ai-block-aggressive.yaml create mode 100644 
data/meta/ai-block-moderate.yaml create mode 100644 data/meta/ai-block-permissive.yaml diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index b355b87..6bc08d5 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -18,7 +18,9 @@ blueskybot boi botnet BPort +Brightbot broked +Bytespider cachebuster Caddyfile caninetools @@ -41,6 +43,7 @@ cloudflare confd containerbuild coreutils +Cotoyogi CRDs crt daemonizing @@ -49,6 +52,7 @@ Debian debrpm decaymap decompiling +Diffbot discordapp discordbot distros @@ -66,11 +70,15 @@ everyones evilbot evilsite expressionorlist +externalagent +externalfetcher extldflags facebookgo +Factset fastcgi fediverse finfos +Firecrawl flagenv Fordola forgejo @@ -86,6 +94,7 @@ googlebot govulncheck GPG GPT +gptbot grw Hashcash hashrate @@ -97,8 +106,11 @@ hostable htmx httpdebug hypertext +iaskspider iat ifm +Imagesift +imgproxy inp iss isset @@ -146,11 +158,15 @@ nginx nobots NONINFRINGEMENT nosleep +OCOB ogtags +omgili +omgilibot onionservice openai openrc pag +Pangu parseable passthrough Patreon @@ -185,18 +201,22 @@ RUnlock sas sasl Scumm +searchbot searx sebest secretplans selfsigned +Semrush setsebool shellcheck +Sidetrade sitemap sls sni Sourceware Spambot sparkline +spyderbot srv stackoverflow startprecmd @@ -212,12 +232,15 @@ techarohq templ templruntime testarea +Tik +Timpibot torproject traefik unixhttpd unmarshal uvx Varis +Velen vendored vhosts videotest @@ -227,9 +250,11 @@ webmaster webpage websecure websites +Webzio wordpress Workaround workdir +wpbot xcaddy Xeact xeiaso diff --git a/data/botPolicies.json b/data/botPolicies.json index 5f24e99..6227639 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -4,7 +4,7 @@ "import": "(data)/bots/_deny-pathological.yaml" }, { - "import": "(data)/bots/ai-robots-txt.yaml" + "import": "(data)/meta/ai-block-aggressive.yaml" }, { "import": "(data)/crawlers/_allow-good.yaml" diff --git 
a/data/botPolicies.yaml b/data/botPolicies.yaml index 917c417..5e444fe 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -17,8 +17,12 @@ bots: import: (data)/bots/_deny-pathological.yaml - import: (data)/bots/aggressive-brazilian-scrapers.yaml - # Enforce https://github.com/ai-robots-txt/ai.robots.txt - - import: (data)/bots/ai-robots-txt.yaml + # Aggressively block AI/LLM related bots/agents by default + - import: (data)/meta/ai-block-aggressive.yaml + + # Consider replacing the aggressive AI policy with more selective policies: + # - import: (data)/meta/ai-block-moderate.yaml + # - import: (data)/meta/ai-block-permissive.yaml # Search engine crawlers to allow, defaults to: # - Google (so they don't try to bypass Anubis) diff --git a/data/bots/ai-catchall.yaml b/data/bots/ai-catchall.yaml new file mode 100644 index 0000000..05dc6e1 --- /dev/null +++ b/data/bots/ai-catchall.yaml @@ -0,0 +1,11 @@ +# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt +# Add new/undocumented agents here. Where documentation exists, consider moving to dedicated policy files. +# Notes on various agents: +# - Amazonbot: Well documented, but they refuse to state which agent collects training data. +# - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated or hallucinations? +# - Perplexity*: Well documented, but they refuse to state which agent collects training data. +# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. 
+- name: "ai-catchall" + user_agent_regex: >- + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot + action: DENY diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml deleted file mode 100644 index e515201..0000000 --- a/data/bots/ai-robots-txt.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. 
-# Note: Blocks human-directed/non-training user agents -- name: "ai-robots-txt" - user_agent_regex: >- AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot - action: DENY diff --git a/data/clients/ai.yaml b/data/clients/ai.yaml new file mode 100644 index 0000000..0ec7022 --- /dev/null +++ b/data/clients/ai.yaml @@ -0,0 +1,8 @@ +# User agents that act on behalf of humans in AI tools, e.g. searching the web. +# Each entry should have a positive/ALLOW entry created as well, with further documentation. +# Exceptions: +# - Claude-User: No published IP allowlist +- name: "ai-clients" + user_agent_regex: >- + ChatGPT-User|Claude-User|MistralAI-User + action: DENY diff --git a/data/crawlers/ai-search.yaml b/data/crawlers/ai-search.yaml new file mode 100644 index 0000000..91855bf --- /dev/null +++ b/data/crawlers/ai-search.yaml @@ -0,0 +1,8 @@ +# User agents that index exclusively for search for AI systems. +# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions: +# - Claude-SearchBot: No published IP allowlist +- name: "ai-crawlers-search" + user_agent_regex: >- + OAI-SearchBot|Claude-SearchBot + action: DENY diff --git a/data/crawlers/ai-training.yaml b/data/crawlers/ai-training.yaml new file mode 100644 index 0000000..fc3ae34 --- /dev/null +++ b/data/crawlers/ai-training.yaml @@ -0,0 +1,8 @@ +# User agents that crawl for training AI/LLM systems +# Each entry should have a positive/ALLOW entry created as well, with further documentation. +# Exceptions: +# - ClaudeBot: No published IP allowlist +- name: "ai-crawlers-training" + user_agent_regex: >- + GPTBot|ClaudeBot + action: DENY diff --git a/data/embed.go b/data/embed.go index 849c75f..c3ed06f 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package data import "embed" var ( - //go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers + //go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta BotPolicies embed.FS ) diff --git a/data/meta/README.md b/data/meta/README.md new file mode 100644 index 0000000..bf758c7 --- /dev/null +++ b/data/meta/README.md @@ -0,0 +1,5 @@ +# meta policies + +Contains policies that exclusively reference policies in _multiple_ other data folders. + +Akin to "stances" that the administrator can take, with reference to various topics, such as AI/LLM systems. \ No newline at end of file diff --git a/data/meta/ai-block-aggressive.yaml b/data/meta/ai-block-aggressive.yaml new file mode 100644 index 0000000..f76e15e --- /dev/null +++ b/data/meta/ai-block-aggressive.yaml @@ -0,0 +1,6 @@ +# Blocks all AI/LLM associated user agents, regardless of purpose or human agency +# Warning: To completely block some AI/LLM training, such as with Google, you _must_ place flags in robots.txt. 
+- import: (data)/bots/ai-catchall.yaml +- import: (data)/clients/ai.yaml +- import: (data)/crawlers/ai-search.yaml +- import: (data)/crawlers/ai-training.yaml \ No newline at end of file diff --git a/data/meta/ai-block-moderate.yaml b/data/meta/ai-block-moderate.yaml new file mode 100644 index 0000000..3fb5fb9 --- /dev/null +++ b/data/meta/ai-block-moderate.yaml @@ -0,0 +1,7 @@ +# Blocks all AI/LLM bots used for training or unknown/undocumented purposes. +# Permits user agents with explicitly documented non-training use, and published IP allowlists. +- import: (data)/bots/ai-catchall.yaml +- import: (data)/crawlers/ai-training.yaml +- import: (data)/crawlers/openai-searchbot.yaml +- import: (data)/clients/openai-chatgpt-user.yaml +- import: (data)/clients/mistral-mistralai-user.yaml \ No newline at end of file diff --git a/data/meta/ai-block-permissive.yaml b/data/meta/ai-block-permissive.yaml new file mode 100644 index 0000000..09a3446 --- /dev/null +++ b/data/meta/ai-block-permissive.yaml @@ -0,0 +1,6 @@ +# Permits all well documented AI/LLM user agents with published IP allowlists. +- import: (data)/bots/ai-catchall.yaml +- import: (data)/crawlers/openai-searchbot.yaml +- import: (data)/crawlers/openai-gptbot.yaml +- import: (data)/clients/openai-chatgpt-user.yaml +- import: (data)/clients/mistral-mistralai-user.yaml \ No newline at end of file diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 45a18b3..cba3cbc 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `--version` flag. - Added `anubis_proxied_requests_total` metric to count proxied requests. - Add `Applebot` as "good" web crawler +- Reorganize AI/LLM crawler blocking into three separate stances, maintaining existing status quo as default. +- Split out AI/LLM user agent blocking policies, adding documentation for each. 
## v1.18.0: Varis zos Galvus diff --git a/docs/docs/admin/configuration/import.mdx b/docs/docs/admin/configuration/import.mdx index 9a526e0..13b7992 100644 --- a/docs/docs/admin/configuration/import.mdx +++ b/docs/docs/admin/configuration/import.mdx @@ -14,7 +14,7 @@ EG: { "bots": [ { - "import": "(data)/bots/ai-robots-txt.yaml" + "import": "(data)/bots/ai-catchall.yaml" }, { "import": "(data)/bots/cloudflare-workers.yaml" @@ -29,8 +29,8 @@ EG: ```yaml bots: # Pathological bots to deny - - # This correlates to data/bots/ai-robots-txt.yaml in the source tree - import: (data)/bots/ai-robots-txt.yaml + - # This correlates to data/bots/ai-catchall.yaml in the source tree + import: (data)/bots/ai-catchall.yaml - import: (data)/bots/cloudflare-workers.yaml ``` @@ -46,7 +46,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con { "bots": [ { - "import": "(data)/bots/ai-robots-txt.yaml", + "import": "(data)/bots/ai-catchall.yaml", "name": "generic-browser", "user_agent_regex": "Mozilla|Opera\n", "action": "CHALLENGE" @@ -60,7 +60,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con ```yaml bots: - - import: (data)/bots/ai-robots-txt.yaml + - import: (data)/bots/ai-catchall.yaml name: generic-browser user_agent_regex: > Mozilla|Opera @@ -167,7 +167,7 @@ static ├── botPolicies.json ├── botPolicies.yaml ├── bots -│ ├── ai-robots-txt.yaml +│ ├── ai-catchall.yaml │ ├── cloudflare-workers.yaml │ ├── headless-browsers.yaml │ └── us-ai-scraper.yaml diff --git a/lib/policy/config/config_test.go b/lib/policy/config/config_test.go index afc1ab8..05515cd 100644 --- a/lib/policy/config/config_test.go +++ b/lib/policy/config/config_test.go @@ -251,6 +251,7 @@ func TestImportStatement(t *testing.T) { "bots", "common", "crawlers", + "meta", } { if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error { if err != nil { @@ -259,6 +260,9 @@ func TestImportStatement(t *testing.T) { if 
d.IsDir() { return nil } + if d.Name() == "README.md" { + return nil + } tests = append(tests, testCase{ name: "(data)/" + path, diff --git a/lib/policy/config/testdata/bad/import_and_bot.json b/lib/policy/config/testdata/bad/import_and_bot.json index 7fa4255..3d0519b 100644 --- a/lib/policy/config/testdata/bad/import_and_bot.json +++ b/lib/policy/config/testdata/bad/import_and_bot.json @@ -1,7 +1,7 @@ { "bots": [ { - "import": "(data)/bots/ai-robots-txt.yaml", + "import": "(data)/bots/ai-catchall.yaml", "name": "generic-browser", "user_agent_regex": "Mozilla|Opera\n", "action": "CHALLENGE" diff --git a/lib/policy/config/testdata/bad/import_and_bot.yaml b/lib/policy/config/testdata/bad/import_and_bot.yaml index 0080b10..fdfaa43 100644 --- a/lib/policy/config/testdata/bad/import_and_bot.yaml +++ b/lib/policy/config/testdata/bad/import_and_bot.yaml @@ -1,5 +1,5 @@ bots: -- import: (data)/bots/ai-robots-txt.yaml +- import: (data)/bots/ai-catchall.yaml name: generic-browser user_agent_regex: > Mozilla|Opera diff --git a/yeetfile.js b/yeetfile.js index 59dbf8c..6f806f2 100644 --- a/yeetfile.js +++ b/yeetfile.js @@ -35,6 +35,7 @@ $`npm run assets`; $`cp -a data/clients ${doc}/data/clients`; $`cp -a data/common ${doc}/data/common`; $`cp -a data/crawlers ${doc}/data/crawlers`; + $`cp -a data/meta ${doc}/data/meta`; }, })); });