Split up AI filtering files (#592)

* Split up AI filtering files

Create aggressive/moderate/permissive meta policies so administrators can choose their AI/LLM stance.

The aggressive policy matches the existing default in Anubis.

Remove the `Google-Extended` flag from `ai-robots-txt.yaml`, since it is a robots.txt-only control and never appears as a request user agent.

Rename `ai-robots-txt.yaml` to `ai-catchall.yaml`, since the file is no longer a copy of the upstream source file.
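
A minimal sketch (mirroring the comments this commit adds to the default `botPolicies.yaml`) of how an administrator would switch from the default aggressive stance to the moderate one:

```yaml
bots:
  # Default stance: aggressively block AI/LLM related bots/agents
  # - import: (data)/meta/ai-block-aggressive.yaml
  # Moderate stance: keep the catch-all and training-crawler blocks,
  # but allow documented non-training agents with published IP allowlists
  - import: (data)/meta/ai-block-moderate.yaml
```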

* chore: spelling

* chore: fix embeds

* chore: fix data includes

* chore: fix file name typo

* chore: Ignore READMEs in configs

* chore(lib/policy/config): go tool goimports -w

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Co-authored-by: Xe Iaso <me@xeiaso.net>
Corry Haines 2025-06-01 13:21:18 -07:00 committed by GitHub
parent 77e0bbbce9
commit de7dbfe6d6
19 changed files with 107 additions and 18 deletions

View File

@@ -18,7 +18,9 @@ blueskybot
 boi
 botnet
 BPort
+Brightbot
 broked
+Bytespider
 cachebuster
 Caddyfile
 caninetools
@@ -41,6 +43,7 @@ cloudflare
 confd
 containerbuild
 coreutils
+Cotoyogi
 CRDs
 crt
 daemonizing
@@ -49,6 +52,7 @@ Debian
 debrpm
 decaymap
 decompiling
+Diffbot
 discordapp
 discordbot
 distros
@@ -66,11 +70,15 @@ everyones
 evilbot
 evilsite
 expressionorlist
+externalagent
+externalfetcher
 extldflags
 facebookgo
+Factset
 fastcgi
 fediverse
 finfos
+Firecrawl
 flagenv
 Fordola
 forgejo
@@ -86,6 +94,7 @@ googlebot
 govulncheck
 GPG
 GPT
+gptbot
 grw
 Hashcash
 hashrate
@@ -97,8 +106,11 @@ hostable
 htmx
 httpdebug
 hypertext
+iaskspider
 iat
 ifm
+Imagesift
+imgproxy
 inp
 iss
 isset
@@ -146,11 +158,15 @@ nginx
 nobots
 NONINFRINGEMENT
 nosleep
+OCOB
 ogtags
+omgili
+omgilibot
 onionservice
 openai
 openrc
 pag
+Pangu
 parseable
 passthrough
 Patreon
@@ -185,18 +201,22 @@ RUnlock
 sas
 sasl
 Scumm
+searchbot
 searx
 sebest
 secretplans
 selfsigned
+Semrush
 setsebool
 shellcheck
+Sidetrade
 sitemap
 sls
 sni
 Sourceware
 Spambot
 sparkline
+spyderbot
 srv
 stackoverflow
 startprecmd
@@ -212,12 +232,15 @@ techarohq
 templ
 templruntime
 testarea
+Tik
+Timpibot
 torproject
 traefik
 unixhttpd
 unmarshal
 uvx
 Varis
+Velen
 vendored
 vhosts
 videotest
@@ -227,9 +250,11 @@ webmaster
 webpage
 websecure
 websites
+Webzio
 wordpress
 Workaround
 workdir
+wpbot
 xcaddy
 Xeact
 xeiaso

View File

@@ -4,7 +4,7 @@
      "import": "(data)/bots/_deny-pathological.yaml"
    },
    {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/meta/ai-block-aggressive.yaml"
    },
    {
      "import": "(data)/crawlers/_allow-good.yaml"

View File

@@ -17,8 +17,12 @@ bots:
     import: (data)/bots/_deny-pathological.yaml
   - import: (data)/bots/aggressive-brazilian-scrapers.yaml

-  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
-  - import: (data)/bots/ai-robots-txt.yaml
+  # Aggressively block AI/LLM related bots/agents by default
+  - import: (data)/meta/ai-block-aggressive.yaml
+  # Consider replacing the aggressive AI policy with more selective policies:
+  # - import: (data)/meta/ai-block-moderate.yaml
+  # - import: (data)/meta/ai-block-permissive.yaml

   # Search engine crawlers to allow, defaults to:
   # - Google (so they don't try to bypass Anubis)

View File

@@ -0,0 +1,11 @@
# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt
# Add new/undocumented agents here. Where documentation exists, consider moving to dedicated policy files.
# Notes on various agents:
# - Amazonbot: Well documented, but they refuse to state which agent collects training data.
# - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated or hallucinations?
# - Perplexity*: Well documented, but they refuse to state which agent collects training data.
# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
- name: "ai-catchall"
user_agent_regex: >-
AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
action: DENY
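
A throwaway sketch of how the catch-all expression is exercised, assuming Anubis compiles `user_agent_regex` with Go's standard `regexp` package (the user agent strings below are illustrative, not authoritative):

```yaml
# Hypothetical test vectors for the catch-all rule above:
#   "Mozilla/5.0 (compatible; Bytespider; spider-feedback@bytedance.com)"  -> matched, DENY
#   "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" -> not matched, falls through
```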

View File

@@ -1,6 +0,0 @@
# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
# Note: Blocks human-directed/non-training user agents
- name: "ai-robots-txt"
user_agent_regex: >-
AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
action: DENY

data/clients/ai.yaml (new file, 8 lines)
View File

@@ -0,0 +1,8 @@
# User agents that act on behalf of humans in AI tools, e.g. searching the web.
# Each entry should have a positive/ALLOW entry created as well, with further documentation.
# Exceptions:
# - Claude-User: No published IP allowlist
- name: "ai-clients"
user_agent_regex: >-
ChatGPT-User|Claude-User|MistralAI-User
action: DENY
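
The comment above calls for a matching positive/ALLOW entry per client; a hypothetical sketch of what such a counterpart could look like (the shipped allow policies referenced later, e.g. `(data)/clients/openai-chatgpt-user.yaml`, additionally pin the agent to its vendor's published IP ranges, omitted here):

```yaml
# Hypothetical ALLOW counterpart for a single client agent; the real policy
# also restricts matches to the vendor's published IP allowlist.
- name: "openai-chatgpt-user"
  user_agent_regex: >-
    ChatGPT-User
  action: ALLOW
```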

View File

@@ -0,0 +1,8 @@
# User agents that index exclusively for search in AI systems.
# Each entry should have a positive/ALLOW entry created as well, with further documentation.
# Exceptions:
# - Claude-SearchBot: No published IP allowlist
- name: "ai-crawlers-search"
user_agent_regex: >-
OAI-SearchBot|Claude-SearchBot
action: DENY

View File

@@ -0,0 +1,8 @@
# User agents that crawl for training AI/LLM systems
# Each entry should have a positive/ALLOW entry created as well, with further documentation.
# Exceptions:
# - ClaudeBot: No published IP allowlist
- name: "ai-crawlers-training"
user_agent_regex: >-
GPTBot|ClaudeBot
action: DENY

View File

@@ -3,6 +3,6 @@ package data
 import "embed"

 var (
-	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers
+	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta
 	BotPolicies embed.FS
 )

data/meta/README.md (new file, 5 lines)
View File

@@ -0,0 +1,5 @@
# meta policies
Contains policies that exclusively reference policies in _multiple_ other data folders.
Akin to "stances" that the administrator can take, with reference to various topics, such as AI/LLM systems.

View File

@@ -0,0 +1,6 @@
# Blocks all AI/LLM associated user agents, regardless of purpose or human agency
# Warning: To completely block some AI/LLM training, such as with Google, you _must_ place flags in robots.txt.
- import: (data)/bots/ai-catchall.yaml
- import: (data)/clients/ai.yaml
- import: (data)/crawlers/ai-search.yaml
- import: (data)/crawlers/ai-training.yaml
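
The robots.txt warning above (and the Google-Extended note in the commit message) exists because some training opt-outs are robots.txt-only controls that never appear as request user agents, so a proxy-side block cannot express them. A minimal robots.txt sketch of the kind of flag meant here, using Google's documented Google-Extended token (the exact token list a site needs is site-specific):

```
User-agent: Google-Extended
Disallow: /
```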

View File

@@ -0,0 +1,7 @@
# Blocks all AI/LLM bots used for training or unknown/undocumented purposes.
# Permits user agents with explicitly documented non-training use, and published IP allowlists.
- import: (data)/bots/ai-catchall.yaml
- import: (data)/crawlers/ai-training.yaml
- import: (data)/crawlers/openai-searchbot.yaml
- import: (data)/clients/openai-chatgpt-user.yaml
- import: (data)/clients/mistral-mistralai-user.yaml

View File

@@ -0,0 +1,6 @@
# Permits all well documented AI/LLM user agents with published IP allowlists.
- import: (data)/bots/ai-catchall.yaml
- import: (data)/crawlers/openai-searchbot.yaml
- import: (data)/crawlers/openai-gptbot.yaml
- import: (data)/clients/openai-chatgpt-user.yaml
- import: (data)/clients/mistral-mistralai-user.yaml

View File

@@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `--version` flag.
 - Added `anubis_proxied_requests_total` metric to count proxied requests.
 - Add `Applebot` as "good" web crawler
+- Reorganize AI/LLM crawler blocking into three separate stances, maintaining existing status quo as default.
+- Split out AI/LLM user agent blocking policies, adding documentation for each.

 ## v1.18.0: Varis zos Galvus

View File

@@ -14,7 +14,7 @@ EG:
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/bots/ai-catchall.yaml"
     },
     {
       "import": "(data)/bots/cloudflare-workers.yaml"
@@ -29,8 +29,8 @@
 ```yaml
 bots:
   # Pathological bots to deny
-  - # This correlates to data/bots/ai-robots-txt.yaml in the source tree
-    import: (data)/bots/ai-robots-txt.yaml
+  - # This correlates to data/bots/ai-catchall.yaml in the source tree
+    import: (data)/bots/ai-catchall.yaml
   - import: (data)/bots/cloudflare-workers.yaml
 ```
@@ -46,7 +46,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"
@@ -60,7 +60,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 ```yaml
 bots:
-  - import: (data)/bots/ai-robots-txt.yaml
+  - import: (data)/bots/ai-catchall.yaml
     name: generic-browser
     user_agent_regex: >
       Mozilla|Opera
@@ -167,7 +167,7 @@ static
 ├── botPolicies.json
 ├── botPolicies.yaml
 ├── bots
-│   ├── ai-robots-txt.yaml
+│   ├── ai-catchall.yaml
 │   ├── cloudflare-workers.yaml
 │   ├── headless-browsers.yaml
 │   └── us-ai-scraper.yaml

View File

@@ -251,6 +251,7 @@ func TestImportStatement(t *testing.T) {
 		"bots",
 		"common",
 		"crawlers",
+		"meta",
 	} {
 		if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error {
 			if err != nil {
@@ -259,6 +260,9 @@ func TestImportStatement(t *testing.T) {
 			if d.IsDir() {
 				return nil
 			}
+			if d.Name() == "README.md" {
+				return nil
+			}
 			tests = append(tests, testCase{
 				name: "(data)/" + path,

View File

@@ -1,7 +1,7 @@
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"

View File

@@ -1,5 +1,5 @@
 bots:
-  - import: (data)/bots/ai-robots-txt.yaml
+  - import: (data)/bots/ai-catchall.yaml
     name: generic-browser
     user_agent_regex: >
       Mozilla|Opera

View File

@@ -35,6 +35,7 @@ $`npm run assets`;
 $`cp -a data/clients ${doc}/data/clients`;
 $`cp -a data/common ${doc}/data/common`;
 $`cp -a data/crawlers ${doc}/data/crawlers`;
+$`cp -a data/meta ${doc}/data/meta`;
 },
 }));
 });