Split up AI filtering files (#592)

* Split up AI filtering files

Creates aggressive/moderate/permissive policies that let administrators choose their AI/LLM stance.

The aggressive policy matches the existing default in Anubis.

Removes the `Google-Extended` flag from `ai-robots-txt.yaml`, as it is a robots.txt-only token that never appears in request user agents.

Renames `ai-robots-txt.yaml` to `ai-catchall.yaml`, as the file is no longer a direct copy of the upstream repo's file.
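
For example, an administrator who wants the middle-ground stance instead of the default swaps one import in `botPolicies.yaml` (a minimal sketch, mirroring the defaults shipped in this change):

```yaml
bots:
# Default stance, matching Anubis's previous behavior:
# - import: (data)/meta/ai-block-aggressive.yaml
# Middle-ground stance:
- import: (data)/meta/ai-block-moderate.yaml
```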

* chore: spelling

* chore: fix embeds

* chore: fix data includes

* chore: fix file name typo

* chore: Ignore READMEs in configs

* chore(lib/policy/config): go tool goimports -w

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Co-authored-by: Xe Iaso <me@xeiaso.net>
Corry Haines, 2025-06-01 13:21:18 -07:00 (committed by GitHub)
parent 77e0bbbce9
commit de7dbfe6d6
19 changed files with 107 additions and 18 deletions

@@ -18,7 +18,9 @@ blueskybot
 boi
 botnet
 BPort
+Brightbot
 broked
+Bytespider
 cachebuster
 Caddyfile
 caninetools
@@ -41,6 +43,7 @@ cloudflare
 confd
 containerbuild
 coreutils
+Cotoyogi
 CRDs
 crt
 daemonizing
@@ -49,6 +52,7 @@ Debian
 debrpm
 decaymap
 decompiling
+Diffbot
 discordapp
 discordbot
 distros
@@ -66,11 +70,15 @@ everyones
 evilbot
 evilsite
 expressionorlist
+externalagent
+externalfetcher
 extldflags
 facebookgo
+Factset
 fastcgi
 fediverse
 finfos
+Firecrawl
 flagenv
 Fordola
 forgejo
@@ -86,6 +94,7 @@ googlebot
 govulncheck
 GPG
 GPT
+gptbot
 grw
 Hashcash
 hashrate
@@ -97,8 +106,11 @@ hostable
 htmx
 httpdebug
 hypertext
+iaskspider
 iat
 ifm
+Imagesift
+imgproxy
 inp
 iss
 isset
@@ -146,11 +158,15 @@ nginx
 nobots
 NONINFRINGEMENT
 nosleep
+OCOB
 ogtags
+omgili
+omgilibot
 onionservice
 openai
 openrc
 pag
+Pangu
 parseable
 passthrough
 Patreon
@@ -185,18 +201,22 @@ RUnlock
 sas
 sasl
 Scumm
+searchbot
 searx
 sebest
 secretplans
 selfsigned
+Semrush
 setsebool
 shellcheck
+Sidetrade
 sitemap
 sls
 sni
 Sourceware
 Spambot
 sparkline
+spyderbot
 srv
 stackoverflow
 startprecmd
@@ -212,12 +232,15 @@ techarohq
 templ
 templruntime
 testarea
+Tik
+Timpibot
 torproject
 traefik
 unixhttpd
 unmarshal
 uvx
 Varis
+Velen
 vendored
 vhosts
 videotest
@@ -227,9 +250,11 @@ webmaster
 webpage
 websecure
 websites
+Webzio
 wordpress
 Workaround
 workdir
+wpbot
 xcaddy
 Xeact
 xeiaso

@@ -4,7 +4,7 @@
       "import": "(data)/bots/_deny-pathological.yaml"
     },
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/meta/ai-block-aggressive.yaml"
     },
     {
       "import": "(data)/crawlers/_allow-good.yaml"

@@ -17,8 +17,12 @@ bots:
   import: (data)/bots/_deny-pathological.yaml
 - import: (data)/bots/aggressive-brazilian-scrapers.yaml
-# Enforce https://github.com/ai-robots-txt/ai.robots.txt
-- import: (data)/bots/ai-robots-txt.yaml
+# Aggressively block AI/LLM related bots/agents by default
+- import: (data)/meta/ai-block-aggressive.yaml
+# Consider replacing the aggressive AI policy with more selective policies:
+# - import: (data)/meta/ai-block-moderate.yaml
+# - import: (data)/meta/ai-block-permissive.yaml
 # Search engine crawlers to allow, defaults to:
 # - Google (so they don't try to bypass Anubis)

data/bots/ai-catchall.yaml (new file)

@@ -0,0 +1,11 @@
+# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt
+# Add new/undocumented agents here. Where documentation exists, consider moving them to dedicated policy files.
+# Notes on various agents:
+# - Amazonbot: Well documented, but they refuse to state which agent collects training data.
+# - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated, or a hallucination?
+# - Perplexity*: Well documented, but they refuse to state which agent collects training data.
+# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
+- name: "ai-catchall"
+  user_agent_regex: >-
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+  action: DENY

data/bots/ai-robots-txt.yaml (deleted)

@@ -1,6 +0,0 @@
-# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
-# Note: Blocks human-directed/non-training user agents
-- name: "ai-robots-txt"
-  user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
-  action: DENY

data/clients/ai.yaml (new file)

@@ -0,0 +1,8 @@
+# User agents that act on behalf of humans in AI tools, e.g. searching the web.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - Claude-User: No published IP allowlist
+- name: "ai-clients"
+  user_agent_regex: >-
+    ChatGPT-User|Claude-User|MistralAI-User
+  action: DENY
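
As the comment above suggests, every client agent denied here should get a documented ALLOW counterpart (the moderate/permissive stances import files such as `openai-chatgpt-user.yaml`). A minimal sketch of what such an entry might look like, assuming Anubis's `remote_addresses` field for IP allowlisting; the CIDR is a placeholder, not OpenAI's real range:

```yaml
# Hypothetical contents of data/clients/openai-chatgpt-user.yaml.
- name: "openai-chatgpt-user"
  user_agent_regex: >-
    ChatGPT-User
  action: ALLOW
  remote_addresses:
    - 192.0.2.0/24 # placeholder (TEST-NET-1); use the vendor's published list
```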

data/crawlers/ai-search.yaml (new file)

@@ -0,0 +1,8 @@
+# User agents that index exclusively for search in AI systems.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - Claude-SearchBot: No published IP allowlist
+- name: "ai-crawlers-search"
+  user_agent_regex: >-
+    OAI-SearchBot|Claude-SearchBot
+  action: DENY

data/crawlers/ai-training.yaml (new file)

@@ -0,0 +1,8 @@
+# User agents that crawl for training AI/LLM systems.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - ClaudeBot: No published IP allowlist
+- name: "ai-crawlers-training"
+  user_agent_regex: >-
+    GPTBot|ClaudeBot
+  action: DENY

@@ -3,6 +3,6 @@ package data
 import "embed"

 var (
-	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers
+	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta
 	BotPolicies embed.FS
 )

data/meta/README.md (new file)

@@ -0,0 +1,5 @@
+# meta policies
+
+Contains policies that exclusively reference policies in _multiple_ other data folders.
+
+These are akin to "stances" that the administrator can take on various topics, such as AI/LLM systems.

data/meta/ai-block-aggressive.yaml (new file)

@@ -0,0 +1,6 @@
+# Blocks all AI/LLM-associated user agents, regardless of purpose or human agency.
+# Warning: To completely block some AI/LLM training, such as with Google, you _must_ place flags in robots.txt.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/clients/ai.yaml
+- import: (data)/crawlers/ai-search.yaml
+- import: (data)/crawlers/ai-training.yaml
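
As the warning above notes, some opt-outs only take effect in robots.txt. Google's AI-training opt-out, for instance, is controlled by the `Google-Extended` product token, which never appears as a request user agent (hence its removal from the catchall regex). A minimal robots.txt opt-out looks roughly like:

```
User-agent: Google-Extended
Disallow: /
```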

data/meta/ai-block-moderate.yaml (new file)

@@ -0,0 +1,7 @@
+# Blocks all AI/LLM bots used for training or unknown/undocumented purposes.
+# Permits user agents with explicitly documented non-training use and published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/ai-training.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml

data/meta/ai-block-permissive.yaml (new file)

@@ -0,0 +1,6 @@
+# Permits all well-documented AI/LLM user agents with published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/crawlers/openai-gptbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml

@@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - Added `--version` flag.
 - Added `anubis_proxied_requests_total` metric to count proxied requests.
 - Add `Applebot` as "good" web crawler
+- Reorganize AI/LLM crawler blocking into three separate stances, maintaining the existing status quo as the default.
+- Split out AI/LLM user agent blocking policies, adding documentation for each.

 ## v1.18.0: Varis zos Galvus

@@ -14,7 +14,7 @@ EG:
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/bots/ai-catchall.yaml"
     },
     {
       "import": "(data)/bots/cloudflare-workers.yaml"
@@ -29,8 +29,8 @@ EG:
 ```yaml
 bots:
 # Pathological bots to deny
-- # This correlates to data/bots/ai-robots-txt.yaml in the source tree
-  import: (data)/bots/ai-robots-txt.yaml
+- # This correlates to data/bots/ai-catchall.yaml in the source tree
+  import: (data)/bots/ai-catchall.yaml
 - import: (data)/bots/cloudflare-workers.yaml
 ```
@@ -46,7 +46,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"
@@ -60,7 +60,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 ```yaml
 bots:
-- import: (data)/bots/ai-robots-txt.yaml
+- import: (data)/bots/ai-catchall.yaml
   name: generic-browser
   user_agent_regex: >
     Mozilla|Opera
@@ -167,7 +167,7 @@ static
 ├── botPolicies.json
 ├── botPolicies.yaml
 ├── bots
-│   ├── ai-robots-txt.yaml
+│   ├── ai-catchall.yaml
 │   ├── cloudflare-workers.yaml
 │   ├── headless-browsers.yaml
 │   └── us-ai-scraper.yaml

@@ -251,6 +251,7 @@ func TestImportStatement(t *testing.T) {
 		"bots",
 		"common",
 		"crawlers",
+		"meta",
 	} {
 		if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error {
 			if err != nil {
@@ -259,6 +260,9 @@ func TestImportStatement(t *testing.T) {
 			if d.IsDir() {
 				return nil
 			}
+			if d.Name() == "README.md" {
+				return nil
+			}
 			tests = append(tests, testCase{
 				name: "(data)/" + path,

@@ -1,7 +1,7 @@
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"

@@ -1,5 +1,5 @@
 bots:
-- import: (data)/bots/ai-robots-txt.yaml
+- import: (data)/bots/ai-catchall.yaml
   name: generic-browser
   user_agent_regex: >
     Mozilla|Opera

@@ -35,6 +35,7 @@ $`npm run assets`;
     $`cp -a data/clients ${doc}/data/clients`;
     $`cp -a data/common ${doc}/data/common`;
     $`cp -a data/crawlers ${doc}/data/crawlers`;
+    $`cp -a data/meta ${doc}/data/meta`;
   },
 }));
 });