Mirror of https://github.com/TecharoHQ/anubis.git, synced 2025-08-03 17:59:24 -04:00
Split up AI filtering files (#592)
* Split up AI filtering files

  Create aggressive/moderate/permissive policies to allow administrators to choose their AI/LLM stance. The aggressive policy matches the existing default in Anubis. Removes the `Google-Extended` flag from `ai-robots-txt.yaml`, as it is a robots.txt-only token that doesn't appear in requests. Renames `ai-robots-txt.yaml` to `ai-catchall.yaml`, as the file is no longer a copy of the source repo/file.

* chore: spelling

* chore: fix embeds

* chore: fix data includes

* chore: fix file name typo

* chore: Ignore READMEs in configs

* chore(lib/policy/config): go tool goimports -w

  Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Co-authored-by: Xe Iaso <me@xeiaso.net>
This commit is contained in: parent 77e0bbbce9, commit de7dbfe6d6
.github/actions/spelling/expect.txt (vendored): 25 additions
@@ -18,7 +18,9 @@ blueskybot
 boi
 botnet
 BPort
+Brightbot
 broked
+Bytespider
 cachebuster
 Caddyfile
 caninetools
@@ -41,6 +43,7 @@ cloudflare
 confd
 containerbuild
 coreutils
+Cotoyogi
 CRDs
 crt
 daemonizing
@@ -49,6 +52,7 @@ Debian
 debrpm
 decaymap
 decompiling
+Diffbot
 discordapp
 discordbot
 distros
@@ -66,11 +70,15 @@ everyones
 evilbot
 evilsite
 expressionorlist
+externalagent
+externalfetcher
 extldflags
 facebookgo
+Factset
 fastcgi
 fediverse
 finfos
+Firecrawl
 flagenv
 Fordola
 forgejo
@@ -86,6 +94,7 @@ googlebot
 govulncheck
 GPG
 GPT
+gptbot
 grw
 Hashcash
 hashrate
@@ -97,8 +106,11 @@ hostable
 htmx
 httpdebug
 hypertext
+iaskspider
 iat
 ifm
+Imagesift
+imgproxy
 inp
 iss
 isset
@@ -146,11 +158,15 @@ nginx
 nobots
 NONINFRINGEMENT
 nosleep
+OCOB
 ogtags
+omgili
+omgilibot
 onionservice
 openai
 openrc
 pag
+Pangu
 parseable
 passthrough
 Patreon
@@ -185,18 +201,22 @@ RUnlock
 sas
 sasl
 Scumm
+searchbot
 searx
 sebest
 secretplans
 selfsigned
+Semrush
 setsebool
 shellcheck
+Sidetrade
 sitemap
 sls
 sni
 Sourceware
 Spambot
 sparkline
+spyderbot
 srv
 stackoverflow
 startprecmd
@@ -212,12 +232,15 @@ techarohq
 templ
 templruntime
 testarea
+Tik
+Timpibot
 torproject
 traefik
 unixhttpd
 unmarshal
 uvx
 Varis
+Velen
 vendored
 vhosts
 videotest
@@ -227,9 +250,11 @@ webmaster
 webpage
 websecure
 websites
+Webzio
 wordpress
 Workaround
 workdir
+wpbot
 xcaddy
 Xeact
 xeiaso
data/botPolicies.json:
@@ -4,7 +4,7 @@
       "import": "(data)/bots/_deny-pathological.yaml"
     },
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/meta/ai-block-aggressive.yaml"
     },
     {
       "import": "(data)/crawlers/_allow-good.yaml"
data/botPolicies.yaml:
@@ -17,8 +17,12 @@ bots:
     import: (data)/bots/_deny-pathological.yaml
   - import: (data)/bots/aggressive-brazilian-scrapers.yaml
 
-  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
-  - import: (data)/bots/ai-robots-txt.yaml
+  # Aggressively block AI/LLM related bots/agents by default
+  - import: (data)/meta/ai-block-aggressive.yaml
+
+  # Consider replacing the aggressive AI policy with more selective policies:
+  # - import: (data)/meta/ai-block-moderate.yaml
+  # - import: (data)/meta/ai-block-permissive.yaml
 
   # Search engine crawlers to allow, defaults to:
   # - Google (so they don't try to bypass Anubis)
data/bots/ai-catchall.yaml (new file, 11 lines):
@@ -0,0 +1,11 @@
+# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt
+# Add new/undocumented agents here. Where documentation exists, consider moving to dedicated policy files.
+# Notes on various agents:
+# - Amazonbot: Well documented, but they refuse to state which agent collects training data.
+# - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated or hallucinations?
+# - Perplexity*: Well documented, but they refuse to state which agent collects training data.
+# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
+- name: "ai-catchall"
+  user_agent_regex: >-
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+  action: DENY
data/bots/ai-robots-txt.yaml (deleted):
@@ -1,6 +0,0 @@
-# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
-# Note: Blocks human-directed/non-training user agents
-- name: "ai-robots-txt"
-  user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
-  action: DENY
data/clients/ai.yaml (new file, 8 lines):
@@ -0,0 +1,8 @@
+# User agents that act on behalf of humans in AI tools, e.g. searching the web.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - Claude-User: No published IP allowlist
+- name: "ai-clients"
+  user_agent_regex: >-
+    ChatGPT-User|Claude-User|MistralAI-User
+  action: DENY
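The "positive/ALLOW entry" convention noted above suggests pairing each denied client with an allow rule gated on the vendor's published IP ranges (the moderate stance imports such files, e.g. `(data)/clients/openai-chatgpt-user.yaml`). A minimal sketch, assuming Anubis's `remote_addresses` rule field and using documentation-only placeholder CIDRs rather than any vendor's real allowlist:

```yaml
# Hypothetical companion ALLOW rule for one AI client agent.
- name: "openai-chatgpt-user"
  user_agent_regex: >-
    ChatGPT-User
  # Placeholder TEST-NET ranges; a real rule would use OpenAI's published allowlist.
  remote_addresses:
    - 192.0.2.0/24
    - 198.51.100.0/24
  action: ALLOW
```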
data/crawlers/ai-search.yaml (new file, 8 lines):
@@ -0,0 +1,8 @@
+# User agents that index exclusively for search in AI systems.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - Claude-SearchBot: No published IP allowlist
+- name: "ai-crawlers-search"
+  user_agent_regex: >-
+    OAI-SearchBot|Claude-SearchBot
+  action: DENY
data/crawlers/ai-training.yaml (new file, 8 lines):
@@ -0,0 +1,8 @@
+# User agents that crawl for training AI/LLM systems
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - ClaudeBot: No published IP allowlist
+- name: "ai-crawlers-training"
+  user_agent_regex: >-
+    GPTBot|ClaudeBot
+  action: DENY
Embedded data filesystem (package data):
@@ -3,6 +3,6 @@ package data
 import "embed"
 
 var (
-	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers
+	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta
 	BotPolicies embed.FS
 )
data/meta/README.md (new file, 5 lines):
@@ -0,0 +1,5 @@
+# meta policies
+
+Contains policies that exclusively reference policies in _multiple_ other data folders.
+
+Akin to "stances" that the administrator can take, with reference to various topics, such as AI/LLM systems.
data/meta/ai-block-aggressive.yaml (new file, 6 lines):
@@ -0,0 +1,6 @@
+# Blocks all AI/LLM associated user agents, regardless of purpose or human agency
+# Warning: To completely block some AI/LLM training, such as with Google, you _must_ place flags in robots.txt.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/clients/ai.yaml
+- import: (data)/crawlers/ai-search.yaml
+- import: (data)/crawlers/ai-training.yaml
data/meta/ai-block-moderate.yaml (new file, 7 lines):
@@ -0,0 +1,7 @@
+# Blocks all AI/LLM bots used for training or unknown/undocumented purposes.
+# Permits user agents with explicitly documented non-training use, and published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/ai-training.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml
data/meta/ai-block-permissive.yaml (new file, 6 lines):
@@ -0,0 +1,6 @@
+# Permits all well documented AI/LLM user agents with published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/crawlers/openai-gptbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml
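Because a meta policy is nothing more than a list of imports over the other data folders, the same mechanism should support custom stances. A sketch, with a hypothetical file name not part of this commit, that keeps the catch-all, client, and training blocks but takes no explicit position on AI search crawlers:

```yaml
# data/meta/ai-block-custom.yaml (hypothetical example)
- import: (data)/bots/ai-catchall.yaml
- import: (data)/clients/ai.yaml
- import: (data)/crawlers/ai-training.yaml
# (data)/crawlers/ai-search.yaml deliberately omitted, so AI search agents
# fall through to the rest of the policy chain.
```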
CHANGELOG:
@@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `--version` flag.
 - Added `anubis_proxied_requests_total` metric to count proxied requests.
 - Add `Applebot` as "good" web crawler
+- Reorganize AI/LLM crawler blocking into three separate stances, maintaining existing status quo as default.
+- Split out AI/LLM user agent blocking policies, adding documentation for each.
 
 ## v1.18.0: Varis zos Galvus
 
Policy documentation (admin docs):
@@ -14,7 +14,7 @@ EG:
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/bots/ai-catchall.yaml"
     },
     {
       "import": "(data)/bots/cloudflare-workers.yaml"
@@ -29,8 +29,8 @@ EG:
 ```yaml
 bots:
 # Pathological bots to deny
-- # This correlates to data/bots/ai-robots-txt.yaml in the source tree
-  import: (data)/bots/ai-robots-txt.yaml
+- # This correlates to data/bots/ai-catchall.yaml in the source tree
+  import: (data)/bots/ai-catchall.yaml
 - import: (data)/bots/cloudflare-workers.yaml
 ```
 
@@ -46,7 +46,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"
@@ -60,7 +60,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 
 ```yaml
 bots:
-- import: (data)/bots/ai-robots-txt.yaml
+- import: (data)/bots/ai-catchall.yaml
   name: generic-browser
   user_agent_regex: >
     Mozilla|Opera
@@ -167,7 +167,7 @@ static
 ├── botPolicies.json
 ├── botPolicies.yaml
 ├── bots
-│   ├── ai-robots-txt.yaml
+│   ├── ai-catchall.yaml
 │   ├── cloudflare-workers.yaml
 │   ├── headless-browsers.yaml
 │   └── us-ai-scraper.yaml
lib/policy/config import test:
@@ -251,6 +251,7 @@ func TestImportStatement(t *testing.T) {
 		"bots",
 		"common",
 		"crawlers",
+		"meta",
 	} {
 		if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error {
 			if err != nil {
@@ -259,6 +260,9 @@ func TestImportStatement(t *testing.T) {
 			if d.IsDir() {
 				return nil
 			}
+			if d.Name() == "README.md" {
+				return nil
+			}
 
 			tests = append(tests, testCase{
 				name: "(data)/" + path,
Policy test fixture (JSON):
@@ -1,7 +1,7 @@
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"
Policy test fixture (YAML):
@@ -1,5 +1,5 @@
 bots:
-- import: (data)/bots/ai-robots-txt.yaml
+- import: (data)/bots/ai-catchall.yaml
   name: generic-browser
   user_agent_regex: >
     Mozilla|Opera
Build script (docs packaging):
@@ -35,6 +35,7 @@ $`npm run assets`;
 $`cp -a data/clients ${doc}/data/clients`;
 $`cp -a data/common ${doc}/data/common`;
 $`cp -a data/crawlers ${doc}/data/crawlers`;
+$`cp -a data/meta ${doc}/data/meta`;
 },
 }));
 });