From 74e11505c6133ee1107811e81a0fd53e1d7876dd Mon Sep 17 00:00:00 2001 From: Xe Iaso Date: Wed, 23 Apr 2025 07:01:28 -0400 Subject: [PATCH] feat: enable loading config fragments (#321) * feat(config): support importing bot policy snippets This changes the grammar of the Anubis bot policy config to allow importing from internal shared rules or external rules on the filesystem. This lets you create a file at `/data/policies/block-evilbot.yaml` and then import it with: ```yaml bots: - import: /data/policies/block-evilbot.yaml ``` This also explodes the default policy file into a bunch of composable snippets. Thank you @Aibrew for your example gitea Atom / RSS feed rules! Signed-off-by: Xe Iaso * fix(data): update botPolicies.json to use imports Signed-off-by: Xe Iaso * fix(cmd/anubis): extract bot policies with --extract-resources This allows a user that doesn't have anything but the Anubis binary to figure out what the default configuration does. * docs(data/botPolicies.yaml): document import syntax in-line Signed-off-by: Xe Iaso * fix(lib/policy): better test importing from JSON snippets Signed-off-by: Xe Iaso * docs(admin): Add import syntax documentation This documents the import syntax and is based on the block comment at the top of the default bot policy file. 
* docs(changelog): add note about importing snippets Signed-off-by: Xe Iaso * style(lib/policy/config): use an error value instead of an inline error Signed-off-by: Xe Iaso --------- Signed-off-by: Xe Iaso --- cmd/anubis/main.go | 6 +- data/apps/gitea-rss-feeds.yaml | 7 + data/botPolicies.json | 659 +---------------- data/botPolicies.yaml | 672 +----------------- data/bots/ai-robots-txt.yaml | 4 + data/bots/cloudflare-workers.yaml | 4 + data/bots/headless-browsers.yaml | 9 + data/bots/us-ai-scraper.yaml | 3 + data/common/allow-private-addresses.yaml | 15 + data/common/keep-internet-working.yaml | 10 + data/crawlers/bingbot.yaml | 34 + data/crawlers/duckduckbot.yaml | 275 +++++++ data/crawlers/googlebot.yaml | 263 +++++++ data/crawlers/internet-archive.yaml | 8 + data/crawlers/kagibot.yaml | 10 + data/crawlers/marginalia.yaml | 11 + data/crawlers/mojeekbot.yaml | 5 + data/crawlers/qwantbot.yaml | 5 + data/embed.go | 2 +- docs/docs/CHANGELOG.md | 1 + docs/docs/admin/configuration/import.mdx | 147 ++++ docs/docs/admin/policies.mdx | 1 + lib/policy/config/config.go | 170 ++++- lib/policy/config/config_test.go | 114 ++- .../config/testdata/bad/import_and_bot.json | 10 + .../config/testdata/bad/import_and_bot.yaml | 6 + .../testdata/bad/import_invalid_file.json | 7 + .../testdata/bad/import_invalid_file.yaml | 2 + .../testdata/good/import_filesystem.json | 7 + .../testdata/good/import_filesystem.yaml | 2 + .../good/import_keep_internet_working.json | 7 + .../good/import_keep_internet_working.yaml | 2 + lib/policy/config/testdata/hack-test.json | 9 + lib/policy/config/testdata/hack-test.yaml | 3 + lib/policy/policy.go | 13 +- lib/policy/testdata/hack-test.json | 9 + lib/policy/testdata/hack-test.yaml | 3 + 37 files changed, 1210 insertions(+), 1305 deletions(-) create mode 100644 data/apps/gitea-rss-feeds.yaml create mode 100644 data/bots/ai-robots-txt.yaml create mode 100644 data/bots/cloudflare-workers.yaml create mode 100644 data/bots/headless-browsers.yaml create mode 
100644 data/bots/us-ai-scraper.yaml create mode 100644 data/common/allow-private-addresses.yaml create mode 100644 data/common/keep-internet-working.yaml create mode 100644 data/crawlers/bingbot.yaml create mode 100644 data/crawlers/duckduckbot.yaml create mode 100644 data/crawlers/googlebot.yaml create mode 100644 data/crawlers/internet-archive.yaml create mode 100644 data/crawlers/kagibot.yaml create mode 100644 data/crawlers/marginalia.yaml create mode 100644 data/crawlers/mojeekbot.yaml create mode 100644 data/crawlers/qwantbot.yaml create mode 100644 docs/docs/admin/configuration/import.mdx create mode 100644 lib/policy/config/testdata/bad/import_and_bot.json create mode 100644 lib/policy/config/testdata/bad/import_and_bot.yaml create mode 100644 lib/policy/config/testdata/bad/import_invalid_file.json create mode 100644 lib/policy/config/testdata/bad/import_invalid_file.yaml create mode 100644 lib/policy/config/testdata/good/import_filesystem.json create mode 100644 lib/policy/config/testdata/good/import_filesystem.yaml create mode 100644 lib/policy/config/testdata/good/import_keep_internet_working.json create mode 100644 lib/policy/config/testdata/good/import_keep_internet_working.yaml create mode 100644 lib/policy/config/testdata/hack-test.json create mode 100644 lib/policy/config/testdata/hack-test.yaml create mode 100644 lib/policy/testdata/hack-test.json create mode 100644 lib/policy/testdata/hack-test.yaml diff --git a/cmd/anubis/main.go b/cmd/anubis/main.go index b7375ea..f47acec 100644 --- a/cmd/anubis/main.go +++ b/cmd/anubis/main.go @@ -27,6 +27,7 @@ import ( "time" "github.com/TecharoHQ/anubis" + "github.com/TecharoHQ/anubis/data" "github.com/TecharoHQ/anubis/internal" libanubis "github.com/TecharoHQ/anubis/lib" botPolicy "github.com/TecharoHQ/anubis/lib/policy" @@ -184,6 +185,9 @@ func main() { } if *extractResources != "" { + if err := extractEmbedFS(data.BotPolicies, ".", *extractResources); err != nil { + log.Fatal(err) + } if err := 
extractEmbedFS(web.Static, "static", *extractResources); err != nil { log.Fatal(err) } @@ -347,7 +351,7 @@ func extractEmbedFS(fsys embed.FS, root string, destDir string) error { return err } - destPath := filepath.Join(destDir, relPath) + destPath := filepath.Join(destDir, root, relPath) if d.IsDir() { return os.MkdirAll(destPath, 0o700) diff --git a/data/apps/gitea-rss-feeds.yaml b/data/apps/gitea-rss-feeds.yaml new file mode 100644 index 0000000..7bd34ce --- /dev/null +++ b/data/apps/gitea-rss-feeds.yaml @@ -0,0 +1,7 @@ +# By Aibrew: https://github.com/TecharoHQ/anubis/discussions/261#discussioncomment-12821065 +- name: gitea-feed-atom + action: ALLOW + path_regex: ^/[.A-Za-z0-9_-]{1,256}?[./A-Za-z0-9_-]*\.atom$ +- name: gitea-feed-rss + action: ALLOW + path_regex: ^/[.A-Za-z0-9_-]{1,256}?[./A-Za-z0-9_-]*\.rss$ \ No newline at end of file diff --git a/data/botPolicies.json b/data/botPolicies.json index 72d38dc..dad04e8 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -1,678 +1,47 @@ { "bots": [ { - "name": "cloudflare-workers", - "headers_regex": { - "CF-Worker": ".*" - }, - "action": "DENY" + "import": "(data)/bots/ai-robots-txt.yaml" }, { - "name": "ai-robots-txt", - "user_agent_regex": "AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot", - "action": "DENY" + "import": "(data)/bots/cloudflare-workers.yaml" }, { - "name": "googlebot", - "user_agent_regex": 
"\\+http\\://www\\.google\\.com/bot\\.html", - "action": "ALLOW", - "remote_addresses": [ - "2001:4860:4801:10::/64", - "2001:4860:4801:11::/64", - "2001:4860:4801:12::/64", - "2001:4860:4801:13::/64", - "2001:4860:4801:14::/64", - "2001:4860:4801:15::/64", - "2001:4860:4801:16::/64", - "2001:4860:4801:17::/64", - "2001:4860:4801:18::/64", - "2001:4860:4801:19::/64", - "2001:4860:4801:1a::/64", - "2001:4860:4801:1b::/64", - "2001:4860:4801:1c::/64", - "2001:4860:4801:1d::/64", - "2001:4860:4801:1e::/64", - "2001:4860:4801:1f::/64", - "2001:4860:4801:20::/64", - "2001:4860:4801:21::/64", - "2001:4860:4801:22::/64", - "2001:4860:4801:23::/64", - "2001:4860:4801:24::/64", - "2001:4860:4801:25::/64", - "2001:4860:4801:26::/64", - "2001:4860:4801:27::/64", - "2001:4860:4801:28::/64", - "2001:4860:4801:29::/64", - "2001:4860:4801:2::/64", - "2001:4860:4801:2a::/64", - "2001:4860:4801:2b::/64", - "2001:4860:4801:2c::/64", - "2001:4860:4801:2d::/64", - "2001:4860:4801:2e::/64", - "2001:4860:4801:2f::/64", - "2001:4860:4801:31::/64", - "2001:4860:4801:32::/64", - "2001:4860:4801:33::/64", - "2001:4860:4801:34::/64", - "2001:4860:4801:35::/64", - "2001:4860:4801:36::/64", - "2001:4860:4801:37::/64", - "2001:4860:4801:38::/64", - "2001:4860:4801:39::/64", - "2001:4860:4801:3a::/64", - "2001:4860:4801:3b::/64", - "2001:4860:4801:3c::/64", - "2001:4860:4801:3d::/64", - "2001:4860:4801:3e::/64", - "2001:4860:4801:40::/64", - "2001:4860:4801:41::/64", - "2001:4860:4801:42::/64", - "2001:4860:4801:43::/64", - "2001:4860:4801:44::/64", - "2001:4860:4801:45::/64", - "2001:4860:4801:46::/64", - "2001:4860:4801:47::/64", - "2001:4860:4801:48::/64", - "2001:4860:4801:49::/64", - "2001:4860:4801:4a::/64", - "2001:4860:4801:4b::/64", - "2001:4860:4801:4c::/64", - "2001:4860:4801:50::/64", - "2001:4860:4801:51::/64", - "2001:4860:4801:52::/64", - "2001:4860:4801:53::/64", - "2001:4860:4801:54::/64", - "2001:4860:4801:55::/64", - "2001:4860:4801:56::/64", - "2001:4860:4801:60::/64", - 
"2001:4860:4801:61::/64", - "2001:4860:4801:62::/64", - "2001:4860:4801:63::/64", - "2001:4860:4801:64::/64", - "2001:4860:4801:65::/64", - "2001:4860:4801:66::/64", - "2001:4860:4801:67::/64", - "2001:4860:4801:68::/64", - "2001:4860:4801:69::/64", - "2001:4860:4801:6a::/64", - "2001:4860:4801:6b::/64", - "2001:4860:4801:6c::/64", - "2001:4860:4801:6d::/64", - "2001:4860:4801:6e::/64", - "2001:4860:4801:6f::/64", - "2001:4860:4801:70::/64", - "2001:4860:4801:71::/64", - "2001:4860:4801:72::/64", - "2001:4860:4801:73::/64", - "2001:4860:4801:74::/64", - "2001:4860:4801:75::/64", - "2001:4860:4801:76::/64", - "2001:4860:4801:77::/64", - "2001:4860:4801:78::/64", - "2001:4860:4801:79::/64", - "2001:4860:4801:80::/64", - "2001:4860:4801:81::/64", - "2001:4860:4801:82::/64", - "2001:4860:4801:83::/64", - "2001:4860:4801:84::/64", - "2001:4860:4801:85::/64", - "2001:4860:4801:86::/64", - "2001:4860:4801:87::/64", - "2001:4860:4801:88::/64", - "2001:4860:4801:90::/64", - "2001:4860:4801:91::/64", - "2001:4860:4801:92::/64", - "2001:4860:4801:93::/64", - "2001:4860:4801:94::/64", - "2001:4860:4801:95::/64", - "2001:4860:4801:96::/64", - "2001:4860:4801:a0::/64", - "2001:4860:4801:a1::/64", - "2001:4860:4801:a2::/64", - "2001:4860:4801:a3::/64", - "2001:4860:4801:a4::/64", - "2001:4860:4801:a5::/64", - "2001:4860:4801:c::/64", - "2001:4860:4801:f::/64", - "192.178.5.0/27", - "192.178.6.0/27", - "192.178.6.128/27", - "192.178.6.160/27", - "192.178.6.192/27", - "192.178.6.32/27", - "192.178.6.64/27", - "192.178.6.96/27", - "34.100.182.96/28", - "34.101.50.144/28", - "34.118.254.0/28", - "34.118.66.0/28", - "34.126.178.96/28", - "34.146.150.144/28", - "34.147.110.144/28", - "34.151.74.144/28", - "34.152.50.64/28", - "34.154.114.144/28", - "34.155.98.32/28", - "34.165.18.176/28", - "34.175.160.64/28", - "34.176.130.16/28", - "34.22.85.0/27", - "34.64.82.64/28", - "34.65.242.112/28", - "34.80.50.80/28", - "34.88.194.0/28", - "34.89.10.80/28", - "34.89.198.80/28", - 
"34.96.162.48/28", - "35.247.243.240/28", - "66.249.64.0/27", - "66.249.64.128/27", - "66.249.64.160/27", - "66.249.64.224/27", - "66.249.64.32/27", - "66.249.64.64/27", - "66.249.64.96/27", - "66.249.65.0/27", - "66.249.65.128/27", - "66.249.65.160/27", - "66.249.65.192/27", - "66.249.65.224/27", - "66.249.65.32/27", - "66.249.65.64/27", - "66.249.65.96/27", - "66.249.66.0/27", - "66.249.66.128/27", - "66.249.66.160/27", - "66.249.66.192/27", - "66.249.66.224/27", - "66.249.66.32/27", - "66.249.66.64/27", - "66.249.66.96/27", - "66.249.68.0/27", - "66.249.68.128/27", - "66.249.68.32/27", - "66.249.68.64/27", - "66.249.68.96/27", - "66.249.69.0/27", - "66.249.69.128/27", - "66.249.69.160/27", - "66.249.69.192/27", - "66.249.69.224/27", - "66.249.69.32/27", - "66.249.69.64/27", - "66.249.69.96/27", - "66.249.70.0/27", - "66.249.70.128/27", - "66.249.70.160/27", - "66.249.70.192/27", - "66.249.70.224/27", - "66.249.70.32/27", - "66.249.70.64/27", - "66.249.70.96/27", - "66.249.71.0/27", - "66.249.71.128/27", - "66.249.71.160/27", - "66.249.71.192/27", - "66.249.71.224/27", - "66.249.71.32/27", - "66.249.71.64/27", - "66.249.71.96/27", - "66.249.72.0/27", - "66.249.72.128/27", - "66.249.72.160/27", - "66.249.72.192/27", - "66.249.72.224/27", - "66.249.72.32/27", - "66.249.72.64/27", - "66.249.72.96/27", - "66.249.73.0/27", - "66.249.73.128/27", - "66.249.73.160/27", - "66.249.73.192/27", - "66.249.73.224/27", - "66.249.73.32/27", - "66.249.73.64/27", - "66.249.73.96/27", - "66.249.74.0/27", - "66.249.74.128/27", - "66.249.74.160/27", - "66.249.74.192/27", - "66.249.74.32/27", - "66.249.74.64/27", - "66.249.74.96/27", - "66.249.75.0/27", - "66.249.75.128/27", - "66.249.75.160/27", - "66.249.75.192/27", - "66.249.75.224/27", - "66.249.75.32/27", - "66.249.75.64/27", - "66.249.75.96/27", - "66.249.76.0/27", - "66.249.76.128/27", - "66.249.76.160/27", - "66.249.76.192/27", - "66.249.76.224/27", - "66.249.76.32/27", - "66.249.76.64/27", - "66.249.76.96/27", - 
"66.249.77.0/27", - "66.249.77.128/27", - "66.249.77.160/27", - "66.249.77.192/27", - "66.249.77.224/27", - "66.249.77.32/27", - "66.249.77.64/27", - "66.249.77.96/27", - "66.249.78.0/27", - "66.249.78.32/27", - "66.249.79.0/27", - "66.249.79.128/27", - "66.249.79.160/27", - "66.249.79.192/27", - "66.249.79.224/27", - "66.249.79.32/27", - "66.249.79.64/27", - "66.249.79.96/27" - ] + "import": "(data)/bots/headless-browsers.yaml" }, { - "name": "bingbot", - "user_agent_regex": "\\+http\\://www\\.bing\\.com/bingbot\\.htm", - "action": "ALLOW", - "remote_addresses": [ - "157.55.39.0/24", - "207.46.13.0/24", - "40.77.167.0/24", - "13.66.139.0/24", - "13.66.144.0/24", - "52.167.144.0/24", - "13.67.10.16/28", - "13.69.66.240/28", - "13.71.172.224/28", - "139.217.52.0/28", - "191.233.204.224/28", - "20.36.108.32/28", - "20.43.120.16/28", - "40.79.131.208/28", - "40.79.186.176/28", - "52.231.148.0/28", - "20.79.107.240/28", - "51.105.67.0/28", - "20.125.163.80/28", - "40.77.188.0/22", - "65.55.210.0/24", - "199.30.24.0/23", - "40.77.202.0/24", - "40.77.139.0/25", - "20.74.197.0/28", - "20.15.133.160/27", - "40.77.177.0/24", - "40.77.178.0/23" - ] + "import": "(data)/bots/us-ai-scraper.yaml" }, { - "name": "duckduckbot", - "user_agent_regex": "\\+http\\://duckduckgo\\.com/duckduckbot\\.html", - "action": "ALLOW", - "remote_addresses": [ - "57.152.72.128/32", - "51.8.253.152/32", - "40.80.242.63/32", - "20.12.141.99/32", - "20.49.136.28/32", - "51.116.131.221/32", - "51.107.40.209/32", - "20.40.133.240/32", - "20.50.168.91/32", - "51.120.48.122/32", - "20.193.45.113/32", - "40.76.173.151/32", - "40.76.163.7/32", - "20.185.79.47/32", - "52.142.26.175/32", - "20.185.79.15/32", - "52.142.24.149/32", - "40.76.162.208/32", - "40.76.163.23/32", - "40.76.162.191/32", - "40.76.162.247/32", - "40.88.21.235/32", - "20.191.45.212/32", - "52.146.59.12/32", - "52.146.59.156/32", - "52.146.59.154/32", - "52.146.58.236/32", - "20.62.224.44/32", - "51.104.180.53/32", - "51.104.180.47/32", - 
"51.104.180.26/32", - "51.104.146.225/32", - "51.104.146.235/32", - "20.73.202.147/32", - "20.73.132.240/32", - "20.71.12.143/32", - "20.56.197.58/32", - "20.56.197.63/32", - "20.43.150.93/32", - "20.43.150.85/32", - "20.44.222.1/32", - "40.89.243.175/32", - "13.89.106.77/32", - "52.143.242.6/32", - "52.143.241.111/32", - "52.154.60.82/32", - "20.197.209.11/32", - "20.197.209.27/32", - "20.226.133.105/32", - "191.234.216.4/32", - "191.234.216.178/32", - "20.53.92.211/32", - "20.53.91.2/32", - "20.207.99.197/32", - "20.207.97.190/32", - "40.81.250.205/32", - "40.64.106.11/32", - "40.64.105.247/32", - "20.72.242.93/32", - "20.99.255.235/32", - "20.113.3.121/32", - "52.224.16.221/32", - "52.224.21.53/32", - "52.224.20.204/32", - "52.224.21.19/32", - "52.224.20.249/32", - "52.224.20.203/32", - "52.224.20.190/32", - "52.224.16.229/32", - "52.224.21.20/32", - "52.146.63.80/32", - "52.224.20.227/32", - "52.224.20.193/32", - "52.190.37.160/32", - "52.224.21.23/32", - "52.224.20.223/32", - "52.224.20.181/32", - "52.224.21.49/32", - "52.224.21.55/32", - "52.224.21.61/32", - "52.224.19.152/32", - "52.224.20.186/32", - "52.224.21.27/32", - "52.224.21.51/32", - "52.224.20.174/32", - "52.224.21.4/32", - "51.104.164.109/32", - "51.104.167.71/32", - "51.104.160.177/32", - "51.104.162.149/32", - "51.104.167.95/32", - "51.104.167.54/32", - "51.104.166.111/32", - "51.104.167.88/32", - "51.104.161.32/32", - "51.104.163.250/32", - "51.104.164.189/32", - "51.104.167.19/32", - "51.104.160.167/32", - "51.104.167.110/32", - "20.191.44.119/32", - "51.104.167.104/32", - "20.191.44.234/32", - "51.104.164.215/32", - "51.104.167.52/32", - "20.191.44.22/32", - "51.104.167.87/32", - "51.104.167.96/32", - "20.191.44.16/32", - "51.104.167.61/32", - "51.104.164.147/32", - "20.50.48.159/32", - "40.114.182.172/32", - "20.50.50.130/32", - "20.50.50.163/32", - "20.50.50.46/32", - "40.114.182.153/32", - "20.50.50.118/32", - "20.50.49.55/32", - "20.50.49.25/32", - "40.114.183.251/32", - "20.50.50.123/32", 
- "20.50.49.237/32", - "20.50.48.192/32", - "20.50.50.134/32", - "51.138.90.233/32", - "40.114.183.196/32", - "20.50.50.146/32", - "40.114.183.88/32", - "20.50.50.145/32", - "20.50.50.121/32", - "20.50.49.40/32", - "51.138.90.206/32", - "40.114.182.45/32", - "51.138.90.161/32", - "20.50.49.0/32", - "40.119.232.215/32", - "104.43.55.167/32", - "40.119.232.251/32", - "40.119.232.50/32", - "40.119.232.146/32", - "40.119.232.218/32", - "104.43.54.127/32", - "104.43.55.117/32", - "104.43.55.116/32", - "104.43.55.166/32", - "52.154.169.50/32", - "52.154.171.70/32", - "52.154.170.229/32", - "52.154.170.113/32", - "52.154.171.44/32", - "52.154.172.2/32", - "52.143.244.81/32", - "52.154.171.87/32", - "52.154.171.250/32", - "52.154.170.28/32", - "52.154.170.122/32", - "52.143.243.117/32", - "52.143.247.235/32", - "52.154.171.235/32", - "52.154.171.196/32", - "52.154.171.0/32", - "52.154.170.243/32", - "52.154.170.26/32", - "52.154.169.200/32", - "52.154.170.96/32", - "52.154.170.88/32", - "52.154.171.150/32", - "52.154.171.205/32", - "52.154.170.117/32", - "52.154.170.209/32", - "191.235.202.48/32", - "191.233.3.202/32", - "191.235.201.214/32", - "191.233.3.197/32", - "191.235.202.38/32", - "20.53.78.144/32", - "20.193.24.10/32", - "20.53.78.236/32", - "20.53.78.138/32", - "20.53.78.123/32", - "20.53.78.106/32", - "20.193.27.215/32", - "20.193.25.197/32", - "20.193.12.126/32", - "20.193.24.251/32", - "20.204.242.101/32", - "20.207.72.113/32", - "20.204.242.19/32", - "20.219.45.67/32", - "20.207.72.11/32", - "20.219.45.190/32", - "20.204.243.55/32", - "20.204.241.148/32", - "20.207.72.110/32", - "20.204.240.172/32", - "20.207.72.21/32", - "20.204.246.81/32", - "20.207.107.181/32", - "20.204.246.254/32", - "20.219.43.246/32", - "52.149.25.43/32", - "52.149.61.51/32", - "52.149.58.139/32", - "52.149.60.38/32", - "52.148.165.38/32", - "52.143.95.162/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - 
"52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32" - ] + "import": "(data)/crawlers/googlebot.yaml" }, { - "name": "qwantbot", - "user_agent_regex": "\\+https\\://help\\.qwant\\.com/bot/", - "action": "ALLOW", - "remote_addresses": [ - "91.242.162.0/24" - ] + "import": "(data)/crawlers/bingbot.yaml" }, { - "name": "internet-archive", - "action": "ALLOW", - "remote_addresses": [ - "207.241.224.0/20", - "208.70.24.0/21", - "2620:0:9c0::/48" - ] + "import": "(data)/crawlers/duckduckbot.yaml" }, { - "name": "kagibot", - "user_agent_regex": "\\+https\\://kagi\\.com/bot", - "action": "ALLOW", - "remote_addresses": [ - "216.18.205.234/32", - "35.212.27.76/32", - "104.254.65.50/32", - "209.151.156.194/32" - ] + "import": "(data)/crawlers/qwantbot.yaml" }, { - "name": "marginalia", - "user_agent_regex": "search\\.marginalia\\.nu", - 
"action": "ALLOW", - "remote_addresses": [ - "193.183.0.162/31", - "193.183.0.164/30", - "193.183.0.168/30", - "193.183.0.172/31", - "193.183.0.174/32" - ] + "import": "(data)/crawlers/internet-archive.yaml" }, { - "name": "mojeekbot", - "user_agent_regex": "http\\://www\\.mojeek\\.com/bot\\.html", - "action": "ALLOW", - "remote_addresses": [ - "5.102.173.71/32" - ] + "import": "(data)/crawlers/kagibot.yaml" }, { - "name": "us-artificial-intelligence-scraper", - "user_agent_regex": "\\+https\\://github\\.com/US-Artificial-Intelligence/scraper", - "action": "DENY" + "import": "(data)/crawlers/marginalia.yaml" }, { - "name": "well-known", - "path_regex": "^/.well-known/.*$", - "action": "ALLOW" + "import": "(data)/crawlers/mojeekbot.yaml" }, { - "name": "favicon", - "path_regex": "^/favicon.ico$", - "action": "ALLOW" - }, - { - "name": "robots-txt", - "path_regex": "^/robots.txt$", - "action": "ALLOW" - }, - { - "name": "lightpanda", - "user_agent_regex": "^Lightpanda/.*$", - "action": "DENY" - }, - { - "name": "headless-chrome", - "user_agent_regex": "HeadlessChrome", - "action": "DENY" - }, - { - "name": "headless-chromium", - "user_agent_regex": "HeadlessChromium", - "action": "DENY" + "import": "(data)/common/keep-internet-working.yaml" }, { "name": "generic-browser", - "user_agent_regex": "Mozilla|Opera", + "user_agent_regex": "Mozilla|Opera\n", "action": "CHALLENGE" } ], diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index cb4715a..585be15 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -1,651 +1,38 @@ +## Anubis has the ability to let you import snippets of configuration into the main +## configuration file. This allows you to break up your config into smaller parts +## that get logically assembled into one big file. +## +## Of note, a bot rule can either have inline bot configuration or import a +## bot config snippet. You cannot do both in a single bot rule. 
+## +## Import paths can either be prefixed with (data) to import from the common/shared +## rules in the data folder in the Anubis source tree or will point to absolute/relative +## paths in your filesystem. If you don't have access to the Anubis source tree, check +## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from. + bots: # Pathological bots to deny -- name: us-artificial-intelligence-scraper - user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper - action: DENY -- name: lightpanda - user_agent_regex: ^LightPanda/.*$ - action: DENY -- name: headless-chrome - user_agent_regex: HeadlessChrome - action: DENY -- name: headless-chromium - user_agent_regex: HeadlessChromium - action: DENY -- name: "ai-robots-txt" - user_agent_regex: > - AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot - action: DENY -- name: cloudflare-workers - headers_regex: - CF-Worker: .* - action: DENY +- # This correlates to data/bots/ai-robots-txt.yaml in the source tree + import: (data)/bots/ai-robots-txt.yaml +- import: (data)/bots/cloudflare-workers.yaml +- import: (data)/bots/headless-browsers.yaml +- import: (data)/bots/us-ai-scraper.yaml -# search engines to allow -- name: googlebot - user_agent_regex: \+http\://www\.google\.com/bot\.html - action: ALLOW - # https://developers.google.com/static/search/apis/ipranges/googlebot.json - remote_addresses: [ - "2001:4860:4801:10::/64", 
- "2001:4860:4801:11::/64", - "2001:4860:4801:12::/64", - "2001:4860:4801:13::/64", - "2001:4860:4801:14::/64", - "2001:4860:4801:15::/64", - "2001:4860:4801:16::/64", - "2001:4860:4801:17::/64", - "2001:4860:4801:18::/64", - "2001:4860:4801:19::/64", - "2001:4860:4801:1a::/64", - "2001:4860:4801:1b::/64", - "2001:4860:4801:1c::/64", - "2001:4860:4801:1d::/64", - "2001:4860:4801:1e::/64", - "2001:4860:4801:1f::/64", - "2001:4860:4801:20::/64", - "2001:4860:4801:21::/64", - "2001:4860:4801:22::/64", - "2001:4860:4801:23::/64", - "2001:4860:4801:24::/64", - "2001:4860:4801:25::/64", - "2001:4860:4801:26::/64", - "2001:4860:4801:27::/64", - "2001:4860:4801:28::/64", - "2001:4860:4801:29::/64", - "2001:4860:4801:2::/64", - "2001:4860:4801:2a::/64", - "2001:4860:4801:2b::/64", - "2001:4860:4801:2c::/64", - "2001:4860:4801:2d::/64", - "2001:4860:4801:2e::/64", - "2001:4860:4801:2f::/64", - "2001:4860:4801:31::/64", - "2001:4860:4801:32::/64", - "2001:4860:4801:33::/64", - "2001:4860:4801:34::/64", - "2001:4860:4801:35::/64", - "2001:4860:4801:36::/64", - "2001:4860:4801:37::/64", - "2001:4860:4801:38::/64", - "2001:4860:4801:39::/64", - "2001:4860:4801:3a::/64", - "2001:4860:4801:3b::/64", - "2001:4860:4801:3c::/64", - "2001:4860:4801:3d::/64", - "2001:4860:4801:3e::/64", - "2001:4860:4801:40::/64", - "2001:4860:4801:41::/64", - "2001:4860:4801:42::/64", - "2001:4860:4801:43::/64", - "2001:4860:4801:44::/64", - "2001:4860:4801:45::/64", - "2001:4860:4801:46::/64", - "2001:4860:4801:47::/64", - "2001:4860:4801:48::/64", - "2001:4860:4801:49::/64", - "2001:4860:4801:4a::/64", - "2001:4860:4801:4b::/64", - "2001:4860:4801:4c::/64", - "2001:4860:4801:50::/64", - "2001:4860:4801:51::/64", - "2001:4860:4801:52::/64", - "2001:4860:4801:53::/64", - "2001:4860:4801:54::/64", - "2001:4860:4801:55::/64", - "2001:4860:4801:56::/64", - "2001:4860:4801:60::/64", - "2001:4860:4801:61::/64", - "2001:4860:4801:62::/64", - "2001:4860:4801:63::/64", - "2001:4860:4801:64::/64", - 
"2001:4860:4801:65::/64", - "2001:4860:4801:66::/64", - "2001:4860:4801:67::/64", - "2001:4860:4801:68::/64", - "2001:4860:4801:69::/64", - "2001:4860:4801:6a::/64", - "2001:4860:4801:6b::/64", - "2001:4860:4801:6c::/64", - "2001:4860:4801:6d::/64", - "2001:4860:4801:6e::/64", - "2001:4860:4801:6f::/64", - "2001:4860:4801:70::/64", - "2001:4860:4801:71::/64", - "2001:4860:4801:72::/64", - "2001:4860:4801:73::/64", - "2001:4860:4801:74::/64", - "2001:4860:4801:75::/64", - "2001:4860:4801:76::/64", - "2001:4860:4801:77::/64", - "2001:4860:4801:78::/64", - "2001:4860:4801:79::/64", - "2001:4860:4801:80::/64", - "2001:4860:4801:81::/64", - "2001:4860:4801:82::/64", - "2001:4860:4801:83::/64", - "2001:4860:4801:84::/64", - "2001:4860:4801:85::/64", - "2001:4860:4801:86::/64", - "2001:4860:4801:87::/64", - "2001:4860:4801:88::/64", - "2001:4860:4801:90::/64", - "2001:4860:4801:91::/64", - "2001:4860:4801:92::/64", - "2001:4860:4801:93::/64", - "2001:4860:4801:94::/64", - "2001:4860:4801:95::/64", - "2001:4860:4801:96::/64", - "2001:4860:4801:a0::/64", - "2001:4860:4801:a1::/64", - "2001:4860:4801:a2::/64", - "2001:4860:4801:a3::/64", - "2001:4860:4801:a4::/64", - "2001:4860:4801:a5::/64", - "2001:4860:4801:c::/64", - "2001:4860:4801:f::/64", - "192.178.5.0/27", - "192.178.6.0/27", - "192.178.6.128/27", - "192.178.6.160/27", - "192.178.6.192/27", - "192.178.6.32/27", - "192.178.6.64/27", - "192.178.6.96/27", - "34.100.182.96/28", - "34.101.50.144/28", - "34.118.254.0/28", - "34.118.66.0/28", - "34.126.178.96/28", - "34.146.150.144/28", - "34.147.110.144/28", - "34.151.74.144/28", - "34.152.50.64/28", - "34.154.114.144/28", - "34.155.98.32/28", - "34.165.18.176/28", - "34.175.160.64/28", - "34.176.130.16/28", - "34.22.85.0/27", - "34.64.82.64/28", - "34.65.242.112/28", - "34.80.50.80/28", - "34.88.194.0/28", - "34.89.10.80/28", - "34.89.198.80/28", - "34.96.162.48/28", - "35.247.243.240/28", - "66.249.64.0/27", - "66.249.64.128/27", - "66.249.64.160/27", - 
"66.249.64.224/27", - "66.249.64.32/27", - "66.249.64.64/27", - "66.249.64.96/27", - "66.249.65.0/27", - "66.249.65.128/27", - "66.249.65.160/27", - "66.249.65.192/27", - "66.249.65.224/27", - "66.249.65.32/27", - "66.249.65.64/27", - "66.249.65.96/27", - "66.249.66.0/27", - "66.249.66.128/27", - "66.249.66.160/27", - "66.249.66.192/27", - "66.249.66.224/27", - "66.249.66.32/27", - "66.249.66.64/27", - "66.249.66.96/27", - "66.249.68.0/27", - "66.249.68.128/27", - "66.249.68.32/27", - "66.249.68.64/27", - "66.249.68.96/27", - "66.249.69.0/27", - "66.249.69.128/27", - "66.249.69.160/27", - "66.249.69.192/27", - "66.249.69.224/27", - "66.249.69.32/27", - "66.249.69.64/27", - "66.249.69.96/27", - "66.249.70.0/27", - "66.249.70.128/27", - "66.249.70.160/27", - "66.249.70.192/27", - "66.249.70.224/27", - "66.249.70.32/27", - "66.249.70.64/27", - "66.249.70.96/27", - "66.249.71.0/27", - "66.249.71.128/27", - "66.249.71.160/27", - "66.249.71.192/27", - "66.249.71.224/27", - "66.249.71.32/27", - "66.249.71.64/27", - "66.249.71.96/27", - "66.249.72.0/27", - "66.249.72.128/27", - "66.249.72.160/27", - "66.249.72.192/27", - "66.249.72.224/27", - "66.249.72.32/27", - "66.249.72.64/27", - "66.249.72.96/27", - "66.249.73.0/27", - "66.249.73.128/27", - "66.249.73.160/27", - "66.249.73.192/27", - "66.249.73.224/27", - "66.249.73.32/27", - "66.249.73.64/27", - "66.249.73.96/27", - "66.249.74.0/27", - "66.249.74.128/27", - "66.249.74.160/27", - "66.249.74.192/27", - "66.249.74.32/27", - "66.249.74.64/27", - "66.249.74.96/27", - "66.249.75.0/27", - "66.249.75.128/27", - "66.249.75.160/27", - "66.249.75.192/27", - "66.249.75.224/27", - "66.249.75.32/27", - "66.249.75.64/27", - "66.249.75.96/27", - "66.249.76.0/27", - "66.249.76.128/27", - "66.249.76.160/27", - "66.249.76.192/27", - "66.249.76.224/27", - "66.249.76.32/27", - "66.249.76.64/27", - "66.249.76.96/27", - "66.249.77.0/27", - "66.249.77.128/27", - "66.249.77.160/27", - "66.249.77.192/27", - "66.249.77.224/27", - 
"66.249.77.32/27", - "66.249.77.64/27", - "66.249.77.96/27", - "66.249.78.0/27", - "66.249.78.32/27", - "66.249.79.0/27", - "66.249.79.128/27", - "66.249.79.160/27", - "66.249.79.192/27", - "66.249.79.224/27", - "66.249.79.32/27", - "66.249.79.64/27", - "66.249.79.96/27" - ] -- name: bingbot - user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm - action: ALLOW - # https://www.bing.com/toolbox/bingbot.json - remote_addresses: [ - "157.55.39.0/24", - "207.46.13.0/24", - "40.77.167.0/24", - "13.66.139.0/24", - "13.66.144.0/24", - "52.167.144.0/24", - "13.67.10.16/28", - "13.69.66.240/28", - "13.71.172.224/28", - "139.217.52.0/28", - "191.233.204.224/28", - "20.36.108.32/28", - "20.43.120.16/28", - "40.79.131.208/28", - "40.79.186.176/28", - "52.231.148.0/28", - "20.79.107.240/28", - "51.105.67.0/28", - "20.125.163.80/28", - "40.77.188.0/22", - "65.55.210.0/24", - "199.30.24.0/23", - "40.77.202.0/24", - "40.77.139.0/25", - "20.74.197.0/28", - "20.15.133.160/27", - "40.77.177.0/24", - "40.77.178.0/23" - ] -- name: duckduckbot - user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) - action: ALLOW - # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot - remote_addresses: [ - "57.152.72.128/32", - "51.8.253.152/32", - "40.80.242.63/32", - "20.12.141.99/32", - "20.49.136.28/32", - "51.116.131.221/32", - "51.107.40.209/32", - "20.40.133.240/32", - "20.50.168.91/32", - "51.120.48.122/32", - "20.193.45.113/32", - "40.76.173.151/32", - "40.76.163.7/32", - "20.185.79.47/32", - "52.142.26.175/32", - "20.185.79.15/32", - "52.142.24.149/32", - "40.76.162.208/32", - "40.76.163.23/32", - "40.76.162.191/32", - "40.76.162.247/32", - "40.88.21.235/32", - "20.191.45.212/32", - "52.146.59.12/32", - "52.146.59.156/32", - "52.146.59.154/32", - "52.146.58.236/32", - "20.62.224.44/32", - "51.104.180.53/32", - "51.104.180.47/32", - "51.104.180.26/32", - "51.104.146.225/32", - "51.104.146.235/32", - "20.73.202.147/32", - "20.73.132.240/32", - 
"20.71.12.143/32", - "20.56.197.58/32", - "20.56.197.63/32", - "20.43.150.93/32", - "20.43.150.85/32", - "20.44.222.1/32", - "40.89.243.175/32", - "13.89.106.77/32", - "52.143.242.6/32", - "52.143.241.111/32", - "52.154.60.82/32", - "20.197.209.11/32", - "20.197.209.27/32", - "20.226.133.105/32", - "191.234.216.4/32", - "191.234.216.178/32", - "20.53.92.211/32", - "20.53.91.2/32", - "20.207.99.197/32", - "20.207.97.190/32", - "40.81.250.205/32", - "40.64.106.11/32", - "40.64.105.247/32", - "20.72.242.93/32", - "20.99.255.235/32", - "20.113.3.121/32", - "52.224.16.221/32", - "52.224.21.53/32", - "52.224.20.204/32", - "52.224.21.19/32", - "52.224.20.249/32", - "52.224.20.203/32", - "52.224.20.190/32", - "52.224.16.229/32", - "52.224.21.20/32", - "52.146.63.80/32", - "52.224.20.227/32", - "52.224.20.193/32", - "52.190.37.160/32", - "52.224.21.23/32", - "52.224.20.223/32", - "52.224.20.181/32", - "52.224.21.49/32", - "52.224.21.55/32", - "52.224.21.61/32", - "52.224.19.152/32", - "52.224.20.186/32", - "52.224.21.27/32", - "52.224.21.51/32", - "52.224.20.174/32", - "52.224.21.4/32", - "51.104.164.109/32", - "51.104.167.71/32", - "51.104.160.177/32", - "51.104.162.149/32", - "51.104.167.95/32", - "51.104.167.54/32", - "51.104.166.111/32", - "51.104.167.88/32", - "51.104.161.32/32", - "51.104.163.250/32", - "51.104.164.189/32", - "51.104.167.19/32", - "51.104.160.167/32", - "51.104.167.110/32", - "20.191.44.119/32", - "51.104.167.104/32", - "20.191.44.234/32", - "51.104.164.215/32", - "51.104.167.52/32", - "20.191.44.22/32", - "51.104.167.87/32", - "51.104.167.96/32", - "20.191.44.16/32", - "51.104.167.61/32", - "51.104.164.147/32", - "20.50.48.159/32", - "40.114.182.172/32", - "20.50.50.130/32", - "20.50.50.163/32", - "20.50.50.46/32", - "40.114.182.153/32", - "20.50.50.118/32", - "20.50.49.55/32", - "20.50.49.25/32", - "40.114.183.251/32", - "20.50.50.123/32", - "20.50.49.237/32", - "20.50.48.192/32", - "20.50.50.134/32", - "51.138.90.233/32", - "40.114.183.196/32", - 
"20.50.50.146/32", - "40.114.183.88/32", - "20.50.50.145/32", - "20.50.50.121/32", - "20.50.49.40/32", - "51.138.90.206/32", - "40.114.182.45/32", - "51.138.90.161/32", - "20.50.49.0/32", - "40.119.232.215/32", - "104.43.55.167/32", - "40.119.232.251/32", - "40.119.232.50/32", - "40.119.232.146/32", - "40.119.232.218/32", - "104.43.54.127/32", - "104.43.55.117/32", - "104.43.55.116/32", - "104.43.55.166/32", - "52.154.169.50/32", - "52.154.171.70/32", - "52.154.170.229/32", - "52.154.170.113/32", - "52.154.171.44/32", - "52.154.172.2/32", - "52.143.244.81/32", - "52.154.171.87/32", - "52.154.171.250/32", - "52.154.170.28/32", - "52.154.170.122/32", - "52.143.243.117/32", - "52.143.247.235/32", - "52.154.171.235/32", - "52.154.171.196/32", - "52.154.171.0/32", - "52.154.170.243/32", - "52.154.170.26/32", - "52.154.169.200/32", - "52.154.170.96/32", - "52.154.170.88/32", - "52.154.171.150/32", - "52.154.171.205/32", - "52.154.170.117/32", - "52.154.170.209/32", - "191.235.202.48/32", - "191.233.3.202/32", - "191.235.201.214/32", - "191.233.3.197/32", - "191.235.202.38/32", - "20.53.78.144/32", - "20.193.24.10/32", - "20.53.78.236/32", - "20.53.78.138/32", - "20.53.78.123/32", - "20.53.78.106/32", - "20.193.27.215/32", - "20.193.25.197/32", - "20.193.12.126/32", - "20.193.24.251/32", - "20.204.242.101/32", - "20.207.72.113/32", - "20.204.242.19/32", - "20.219.45.67/32", - "20.207.72.11/32", - "20.219.45.190/32", - "20.204.243.55/32", - "20.204.241.148/32", - "20.207.72.110/32", - "20.204.240.172/32", - "20.207.72.21/32", - "20.204.246.81/32", - "20.207.107.181/32", - "20.204.246.254/32", - "20.219.43.246/32", - "52.149.25.43/32", - "52.149.61.51/32", - "52.149.58.139/32", - "52.149.60.38/32", - "52.148.165.38/32", - "52.143.95.162/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - 
"20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32" - ] -- name: qwantbot - user_agent_regex: \+https\://help\.qwant\.com/bot/ - action: ALLOW - # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json - remote_addresses: [ "91.242.162.0/24" ] -- name: internet-archive - action: ALLOW - # https://ipinfo.io/AS7941 - remote_addresses: [ - "207.241.224.0/20", - "208.70.24.0/21", - "2620:0:9c0::/48" - ] -- name: kagibot - user_agent_regex: \+https\://kagi\.com/bot - action: ALLOW - # https://kagi.com/bot - remote_addresses: [ - "216.18.205.234/32", - "35.212.27.76/32", - "104.254.65.50/32", - "209.151.156.194/32" - ] -- name: marginalia - user_agent_regex: search\.marginalia\.nu - action: ALLOW - # Received directly over email - remote_addresses: [ - "193.183.0.162/31", - "193.183.0.164/30", - "193.183.0.168/30", - "193.183.0.172/31", - "193.183.0.174/32" - ] -- name: mojeekbot - user_agent_regex: http\://www\.mojeek\.com/bot\.html - 
action: ALLOW - # https://www.mojeek.com/bot.html - remote_addresses: [ "5.102.173.71/32" ] +# Search engines to allow +- import: (data)/crawlers/googlebot.yaml +- import: (data)/crawlers/bingbot.yaml +- import: (data)/crawlers/duckduckbot.yaml +- import: (data)/crawlers/qwantbot.yaml +- import: (data)/crawlers/internet-archive.yaml +- import: (data)/crawlers/kagibot.yaml +- import: (data)/crawlers/marginalia.yaml +- import: (data)/crawlers/mojeekbot.yaml -# Common "keeping the internet working" routes -- name: well-known - path_regex: ^/.well-known/.*$ - action: ALLOW -- name: favicon - path_regex: ^/favicon.ico$ - action: ALLOW -- name: robots-txt - path_regex: ^/robots.txt$ - action: ALLOW +# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) +- import: (data)/common/keep-internet-working.yaml # # Punish any bot with "bot" in the user-agent string +# # This is known to have a high false-positive rate, use at your own risk # - name: generic-bot-catchall # user_agent_regex: (?i:bot|crawler) # action: CHALLENGE @@ -654,6 +41,7 @@ bots: # report_as: 4 # lie to the operator # algorithm: slow # intentionally waste CPU cycles and time +# Generic catchall rule - name: generic-browser user_agent_regex: > Mozilla|Opera diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml new file mode 100644 index 0000000..19cbe93 --- /dev/null +++ b/data/bots/ai-robots-txt.yaml @@ -0,0 +1,4 @@ +- name: "ai-robots-txt" + user_agent_regex: > + AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo 
Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot + action: DENY \ No newline at end of file diff --git a/data/bots/cloudflare-workers.yaml b/data/bots/cloudflare-workers.yaml new file mode 100644 index 0000000..3fe051b --- /dev/null +++ b/data/bots/cloudflare-workers.yaml @@ -0,0 +1,4 @@ +- name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY \ No newline at end of file diff --git a/data/bots/headless-browsers.yaml b/data/bots/headless-browsers.yaml new file mode 100644 index 0000000..9805290 --- /dev/null +++ b/data/bots/headless-browsers.yaml @@ -0,0 +1,9 @@ +- name: lightpanda + user_agent_regex: ^LightPanda/.*$ + action: DENY +- name: headless-chrome + user_agent_regex: HeadlessChrome + action: DENY +- name: headless-chromium + user_agent_regex: HeadlessChromium + action: DENY \ No newline at end of file diff --git a/data/bots/us-ai-scraper.yaml b/data/bots/us-ai-scraper.yaml new file mode 100644 index 0000000..b68920f --- /dev/null +++ b/data/bots/us-ai-scraper.yaml @@ -0,0 +1,3 @@ +- name: us-artificial-intelligence-scraper + user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper + action: DENY \ No newline at end of file diff --git a/data/common/allow-private-addresses.yaml b/data/common/allow-private-addresses.yaml new file mode 100644 index 0000000..3a3c0dc --- /dev/null +++ b/data/common/allow-private-addresses.yaml @@ -0,0 +1,15 @@ +- name: ipv4-rfc-1918 + action: ALLOW + remote_addresses: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + - 100.64.0.0/10 +- name: ipv6-ula + action: ALLOW + remote_addresses: + - fc00::/7 +- name: ipv6-link-local + action: ALLOW + remote_addresses: + - fe80::/10 \ No newline at end of file diff --git a/data/common/keep-internet-working.yaml b/data/common/keep-internet-working.yaml new file mode 
100644 index 0000000..8270ef4 --- /dev/null +++ b/data/common/keep-internet-working.yaml @@ -0,0 +1,10 @@ +# Common "keeping the internet working" routes +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW +- name: favicon + path_regex: ^/favicon.ico$ + action: ALLOW +- name: robots-txt + path_regex: ^/robots.txt$ + action: ALLOW \ No newline at end of file diff --git a/data/crawlers/bingbot.yaml b/data/crawlers/bingbot.yaml new file mode 100644 index 0000000..2f7885d --- /dev/null +++ b/data/crawlers/bingbot.yaml @@ -0,0 +1,34 @@ +- name: bingbot + user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm + action: ALLOW + # https://www.bing.com/toolbox/bingbot.json + remote_addresses: [ + "157.55.39.0/24", + "207.46.13.0/24", + "40.77.167.0/24", + "13.66.139.0/24", + "13.66.144.0/24", + "52.167.144.0/24", + "13.67.10.16/28", + "13.69.66.240/28", + "13.71.172.224/28", + "139.217.52.0/28", + "191.233.204.224/28", + "20.36.108.32/28", + "20.43.120.16/28", + "40.79.131.208/28", + "40.79.186.176/28", + "52.231.148.0/28", + "20.79.107.240/28", + "51.105.67.0/28", + "20.125.163.80/28", + "40.77.188.0/22", + "65.55.210.0/24", + "199.30.24.0/23", + "40.77.202.0/24", + "40.77.139.0/25", + "20.74.197.0/28", + "20.15.133.160/27", + "40.77.177.0/24", + "40.77.178.0/23" + ] diff --git a/data/crawlers/duckduckbot.yaml b/data/crawlers/duckduckbot.yaml new file mode 100644 index 0000000..302a1e3 --- /dev/null +++ b/data/crawlers/duckduckbot.yaml @@ -0,0 +1,275 @@ +- name: duckduckbot + user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) + action: ALLOW + # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot + remote_addresses: [ + "57.152.72.128/32", + "51.8.253.152/32", + "40.80.242.63/32", + "20.12.141.99/32", + "20.49.136.28/32", + "51.116.131.221/32", + "51.107.40.209/32", + "20.40.133.240/32", + "20.50.168.91/32", + "51.120.48.122/32", + "20.193.45.113/32", + "40.76.173.151/32", + "40.76.163.7/32", + 
"20.185.79.47/32", + "52.142.26.175/32", + "20.185.79.15/32", + "52.142.24.149/32", + "40.76.162.208/32", + "40.76.163.23/32", + "40.76.162.191/32", + "40.76.162.247/32", + "40.88.21.235/32", + "20.191.45.212/32", + "52.146.59.12/32", + "52.146.59.156/32", + "52.146.59.154/32", + "52.146.58.236/32", + "20.62.224.44/32", + "51.104.180.53/32", + "51.104.180.47/32", + "51.104.180.26/32", + "51.104.146.225/32", + "51.104.146.235/32", + "20.73.202.147/32", + "20.73.132.240/32", + "20.71.12.143/32", + "20.56.197.58/32", + "20.56.197.63/32", + "20.43.150.93/32", + "20.43.150.85/32", + "20.44.222.1/32", + "40.89.243.175/32", + "13.89.106.77/32", + "52.143.242.6/32", + "52.143.241.111/32", + "52.154.60.82/32", + "20.197.209.11/32", + "20.197.209.27/32", + "20.226.133.105/32", + "191.234.216.4/32", + "191.234.216.178/32", + "20.53.92.211/32", + "20.53.91.2/32", + "20.207.99.197/32", + "20.207.97.190/32", + "40.81.250.205/32", + "40.64.106.11/32", + "40.64.105.247/32", + "20.72.242.93/32", + "20.99.255.235/32", + "20.113.3.121/32", + "52.224.16.221/32", + "52.224.21.53/32", + "52.224.20.204/32", + "52.224.21.19/32", + "52.224.20.249/32", + "52.224.20.203/32", + "52.224.20.190/32", + "52.224.16.229/32", + "52.224.21.20/32", + "52.146.63.80/32", + "52.224.20.227/32", + "52.224.20.193/32", + "52.190.37.160/32", + "52.224.21.23/32", + "52.224.20.223/32", + "52.224.20.181/32", + "52.224.21.49/32", + "52.224.21.55/32", + "52.224.21.61/32", + "52.224.19.152/32", + "52.224.20.186/32", + "52.224.21.27/32", + "52.224.21.51/32", + "52.224.20.174/32", + "52.224.21.4/32", + "51.104.164.109/32", + "51.104.167.71/32", + "51.104.160.177/32", + "51.104.162.149/32", + "51.104.167.95/32", + "51.104.167.54/32", + "51.104.166.111/32", + "51.104.167.88/32", + "51.104.161.32/32", + "51.104.163.250/32", + "51.104.164.189/32", + "51.104.167.19/32", + "51.104.160.167/32", + "51.104.167.110/32", + "20.191.44.119/32", + "51.104.167.104/32", + "20.191.44.234/32", + "51.104.164.215/32", + 
"51.104.167.52/32", + "20.191.44.22/32", + "51.104.167.87/32", + "51.104.167.96/32", + "20.191.44.16/32", + "51.104.167.61/32", + "51.104.164.147/32", + "20.50.48.159/32", + "40.114.182.172/32", + "20.50.50.130/32", + "20.50.50.163/32", + "20.50.50.46/32", + "40.114.182.153/32", + "20.50.50.118/32", + "20.50.49.55/32", + "20.50.49.25/32", + "40.114.183.251/32", + "20.50.50.123/32", + "20.50.49.237/32", + "20.50.48.192/32", + "20.50.50.134/32", + "51.138.90.233/32", + "40.114.183.196/32", + "20.50.50.146/32", + "40.114.183.88/32", + "20.50.50.145/32", + "20.50.50.121/32", + "20.50.49.40/32", + "51.138.90.206/32", + "40.114.182.45/32", + "51.138.90.161/32", + "20.50.49.0/32", + "40.119.232.215/32", + "104.43.55.167/32", + "40.119.232.251/32", + "40.119.232.50/32", + "40.119.232.146/32", + "40.119.232.218/32", + "104.43.54.127/32", + "104.43.55.117/32", + "104.43.55.116/32", + "104.43.55.166/32", + "52.154.169.50/32", + "52.154.171.70/32", + "52.154.170.229/32", + "52.154.170.113/32", + "52.154.171.44/32", + "52.154.172.2/32", + "52.143.244.81/32", + "52.154.171.87/32", + "52.154.171.250/32", + "52.154.170.28/32", + "52.154.170.122/32", + "52.143.243.117/32", + "52.143.247.235/32", + "52.154.171.235/32", + "52.154.171.196/32", + "52.154.171.0/32", + "52.154.170.243/32", + "52.154.170.26/32", + "52.154.169.200/32", + "52.154.170.96/32", + "52.154.170.88/32", + "52.154.171.150/32", + "52.154.171.205/32", + "52.154.170.117/32", + "52.154.170.209/32", + "191.235.202.48/32", + "191.233.3.202/32", + "191.235.201.214/32", + "191.233.3.197/32", + "191.235.202.38/32", + "20.53.78.144/32", + "20.193.24.10/32", + "20.53.78.236/32", + "20.53.78.138/32", + "20.53.78.123/32", + "20.53.78.106/32", + "20.193.27.215/32", + "20.193.25.197/32", + "20.193.12.126/32", + "20.193.24.251/32", + "20.204.242.101/32", + "20.207.72.113/32", + "20.204.242.19/32", + "20.219.45.67/32", + "20.207.72.11/32", + "20.219.45.190/32", + "20.204.243.55/32", + "20.204.241.148/32", + "20.207.72.110/32", + 
"20.204.240.172/32", + "20.207.72.21/32", + "20.204.246.81/32", + "20.207.107.181/32", + "20.204.246.254/32", + "20.219.43.246/32", + "52.149.25.43/32", + "52.149.61.51/32", + "52.149.58.139/32", + "52.149.60.38/32", + "52.148.165.38/32", + "52.143.95.162/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32" + ] diff --git a/data/crawlers/googlebot.yaml b/data/crawlers/googlebot.yaml new file mode 100644 index 0000000..f173512 --- /dev/null +++ b/data/crawlers/googlebot.yaml @@ -0,0 +1,263 @@ +- name: googlebot + user_agent_regex: \+http\://www\.google\.com/bot\.html + action: ALLOW + # https://developers.google.com/static/search/apis/ipranges/googlebot.json + remote_addresses: [ + "2001:4860:4801:10::/64", + 
"2001:4860:4801:11::/64", + "2001:4860:4801:12::/64", + "2001:4860:4801:13::/64", + "2001:4860:4801:14::/64", + "2001:4860:4801:15::/64", + "2001:4860:4801:16::/64", + "2001:4860:4801:17::/64", + "2001:4860:4801:18::/64", + "2001:4860:4801:19::/64", + "2001:4860:4801:1a::/64", + "2001:4860:4801:1b::/64", + "2001:4860:4801:1c::/64", + "2001:4860:4801:1d::/64", + "2001:4860:4801:1e::/64", + "2001:4860:4801:1f::/64", + "2001:4860:4801:20::/64", + "2001:4860:4801:21::/64", + "2001:4860:4801:22::/64", + "2001:4860:4801:23::/64", + "2001:4860:4801:24::/64", + "2001:4860:4801:25::/64", + "2001:4860:4801:26::/64", + "2001:4860:4801:27::/64", + "2001:4860:4801:28::/64", + "2001:4860:4801:29::/64", + "2001:4860:4801:2::/64", + "2001:4860:4801:2a::/64", + "2001:4860:4801:2b::/64", + "2001:4860:4801:2c::/64", + "2001:4860:4801:2d::/64", + "2001:4860:4801:2e::/64", + "2001:4860:4801:2f::/64", + "2001:4860:4801:31::/64", + "2001:4860:4801:32::/64", + "2001:4860:4801:33::/64", + "2001:4860:4801:34::/64", + "2001:4860:4801:35::/64", + "2001:4860:4801:36::/64", + "2001:4860:4801:37::/64", + "2001:4860:4801:38::/64", + "2001:4860:4801:39::/64", + "2001:4860:4801:3a::/64", + "2001:4860:4801:3b::/64", + "2001:4860:4801:3c::/64", + "2001:4860:4801:3d::/64", + "2001:4860:4801:3e::/64", + "2001:4860:4801:40::/64", + "2001:4860:4801:41::/64", + "2001:4860:4801:42::/64", + "2001:4860:4801:43::/64", + "2001:4860:4801:44::/64", + "2001:4860:4801:45::/64", + "2001:4860:4801:46::/64", + "2001:4860:4801:47::/64", + "2001:4860:4801:48::/64", + "2001:4860:4801:49::/64", + "2001:4860:4801:4a::/64", + "2001:4860:4801:4b::/64", + "2001:4860:4801:4c::/64", + "2001:4860:4801:50::/64", + "2001:4860:4801:51::/64", + "2001:4860:4801:52::/64", + "2001:4860:4801:53::/64", + "2001:4860:4801:54::/64", + "2001:4860:4801:55::/64", + "2001:4860:4801:56::/64", + "2001:4860:4801:60::/64", + "2001:4860:4801:61::/64", + "2001:4860:4801:62::/64", + "2001:4860:4801:63::/64", + "2001:4860:4801:64::/64", + 
"2001:4860:4801:65::/64", + "2001:4860:4801:66::/64", + "2001:4860:4801:67::/64", + "2001:4860:4801:68::/64", + "2001:4860:4801:69::/64", + "2001:4860:4801:6a::/64", + "2001:4860:4801:6b::/64", + "2001:4860:4801:6c::/64", + "2001:4860:4801:6d::/64", + "2001:4860:4801:6e::/64", + "2001:4860:4801:6f::/64", + "2001:4860:4801:70::/64", + "2001:4860:4801:71::/64", + "2001:4860:4801:72::/64", + "2001:4860:4801:73::/64", + "2001:4860:4801:74::/64", + "2001:4860:4801:75::/64", + "2001:4860:4801:76::/64", + "2001:4860:4801:77::/64", + "2001:4860:4801:78::/64", + "2001:4860:4801:79::/64", + "2001:4860:4801:80::/64", + "2001:4860:4801:81::/64", + "2001:4860:4801:82::/64", + "2001:4860:4801:83::/64", + "2001:4860:4801:84::/64", + "2001:4860:4801:85::/64", + "2001:4860:4801:86::/64", + "2001:4860:4801:87::/64", + "2001:4860:4801:88::/64", + "2001:4860:4801:90::/64", + "2001:4860:4801:91::/64", + "2001:4860:4801:92::/64", + "2001:4860:4801:93::/64", + "2001:4860:4801:94::/64", + "2001:4860:4801:95::/64", + "2001:4860:4801:96::/64", + "2001:4860:4801:a0::/64", + "2001:4860:4801:a1::/64", + "2001:4860:4801:a2::/64", + "2001:4860:4801:a3::/64", + "2001:4860:4801:a4::/64", + "2001:4860:4801:a5::/64", + "2001:4860:4801:c::/64", + "2001:4860:4801:f::/64", + "192.178.5.0/27", + "192.178.6.0/27", + "192.178.6.128/27", + "192.178.6.160/27", + "192.178.6.192/27", + "192.178.6.32/27", + "192.178.6.64/27", + "192.178.6.96/27", + "34.100.182.96/28", + "34.101.50.144/28", + "34.118.254.0/28", + "34.118.66.0/28", + "34.126.178.96/28", + "34.146.150.144/28", + "34.147.110.144/28", + "34.151.74.144/28", + "34.152.50.64/28", + "34.154.114.144/28", + "34.155.98.32/28", + "34.165.18.176/28", + "34.175.160.64/28", + "34.176.130.16/28", + "34.22.85.0/27", + "34.64.82.64/28", + "34.65.242.112/28", + "34.80.50.80/28", + "34.88.194.0/28", + "34.89.10.80/28", + "34.89.198.80/28", + "34.96.162.48/28", + "35.247.243.240/28", + "66.249.64.0/27", + "66.249.64.128/27", + "66.249.64.160/27", + 
"66.249.64.224/27", + "66.249.64.32/27", + "66.249.64.64/27", + "66.249.64.96/27", + "66.249.65.0/27", + "66.249.65.128/27", + "66.249.65.160/27", + "66.249.65.192/27", + "66.249.65.224/27", + "66.249.65.32/27", + "66.249.65.64/27", + "66.249.65.96/27", + "66.249.66.0/27", + "66.249.66.128/27", + "66.249.66.160/27", + "66.249.66.192/27", + "66.249.66.224/27", + "66.249.66.32/27", + "66.249.66.64/27", + "66.249.66.96/27", + "66.249.68.0/27", + "66.249.68.128/27", + "66.249.68.32/27", + "66.249.68.64/27", + "66.249.68.96/27", + "66.249.69.0/27", + "66.249.69.128/27", + "66.249.69.160/27", + "66.249.69.192/27", + "66.249.69.224/27", + "66.249.69.32/27", + "66.249.69.64/27", + "66.249.69.96/27", + "66.249.70.0/27", + "66.249.70.128/27", + "66.249.70.160/27", + "66.249.70.192/27", + "66.249.70.224/27", + "66.249.70.32/27", + "66.249.70.64/27", + "66.249.70.96/27", + "66.249.71.0/27", + "66.249.71.128/27", + "66.249.71.160/27", + "66.249.71.192/27", + "66.249.71.224/27", + "66.249.71.32/27", + "66.249.71.64/27", + "66.249.71.96/27", + "66.249.72.0/27", + "66.249.72.128/27", + "66.249.72.160/27", + "66.249.72.192/27", + "66.249.72.224/27", + "66.249.72.32/27", + "66.249.72.64/27", + "66.249.72.96/27", + "66.249.73.0/27", + "66.249.73.128/27", + "66.249.73.160/27", + "66.249.73.192/27", + "66.249.73.224/27", + "66.249.73.32/27", + "66.249.73.64/27", + "66.249.73.96/27", + "66.249.74.0/27", + "66.249.74.128/27", + "66.249.74.160/27", + "66.249.74.192/27", + "66.249.74.32/27", + "66.249.74.64/27", + "66.249.74.96/27", + "66.249.75.0/27", + "66.249.75.128/27", + "66.249.75.160/27", + "66.249.75.192/27", + "66.249.75.224/27", + "66.249.75.32/27", + "66.249.75.64/27", + "66.249.75.96/27", + "66.249.76.0/27", + "66.249.76.128/27", + "66.249.76.160/27", + "66.249.76.192/27", + "66.249.76.224/27", + "66.249.76.32/27", + "66.249.76.64/27", + "66.249.76.96/27", + "66.249.77.0/27", + "66.249.77.128/27", + "66.249.77.160/27", + "66.249.77.192/27", + "66.249.77.224/27", + 
"66.249.77.32/27", + "66.249.77.64/27", + "66.249.77.96/27", + "66.249.78.0/27", + "66.249.78.32/27", + "66.249.79.0/27", + "66.249.79.128/27", + "66.249.79.160/27", + "66.249.79.192/27", + "66.249.79.224/27", + "66.249.79.32/27", + "66.249.79.64/27", + "66.249.79.96/27" + ] diff --git a/data/crawlers/internet-archive.yaml b/data/crawlers/internet-archive.yaml new file mode 100644 index 0000000..5e209e8 --- /dev/null +++ b/data/crawlers/internet-archive.yaml @@ -0,0 +1,8 @@ +- name: internet-archive + action: ALLOW + # https://ipinfo.io/AS7941 + remote_addresses: [ + "207.241.224.0/20", + "208.70.24.0/21", + "2620:0:9c0::/48" + ] \ No newline at end of file diff --git a/data/crawlers/kagibot.yaml b/data/crawlers/kagibot.yaml new file mode 100644 index 0000000..db62b57 --- /dev/null +++ b/data/crawlers/kagibot.yaml @@ -0,0 +1,10 @@ +- name: kagibot + user_agent_regex: \+https\://kagi\.com/bot + action: ALLOW + # https://kagi.com/bot + remote_addresses: [ + "216.18.205.234/32", + "35.212.27.76/32", + "104.254.65.50/32", + "209.151.156.194/32" + ] diff --git a/data/crawlers/marginalia.yaml b/data/crawlers/marginalia.yaml new file mode 100644 index 0000000..e12ebc4 --- /dev/null +++ b/data/crawlers/marginalia.yaml @@ -0,0 +1,11 @@ +- name: marginalia + user_agent_regex: search\.marginalia\.nu + action: ALLOW + # Received directly over email + remote_addresses: [ + "193.183.0.162/31", + "193.183.0.164/30", + "193.183.0.168/30", + "193.183.0.172/31", + "193.183.0.174/32" + ] \ No newline at end of file diff --git a/data/crawlers/mojeekbot.yaml b/data/crawlers/mojeekbot.yaml new file mode 100644 index 0000000..fcd20f5 --- /dev/null +++ b/data/crawlers/mojeekbot.yaml @@ -0,0 +1,5 @@ +- name: mojeekbot + user_agent_regex: http\://www\.mojeek\.com/bot\.html + action: ALLOW + # https://www.mojeek.com/bot.html + remote_addresses: [ "5.102.173.71/32" ] \ No newline at end of file diff --git a/data/crawlers/qwantbot.yaml b/data/crawlers/qwantbot.yaml new file mode 100644 index 
0000000..a402154 --- /dev/null +++ b/data/crawlers/qwantbot.yaml @@ -0,0 +1,5 @@ +- name: qwantbot + user_agent_regex: \+https\://help\.qwant\.com/bot/ + action: ALLOW + # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + remote_addresses: [ "91.242.162.0/24" ] diff --git a/data/embed.go b/data/embed.go index c1fbe68..ebb2152 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package data import "embed" var ( - //go:embed botPolicies.yaml botPolicies.json + //go:embed botPolicies.yaml botPolicies.json apps bots common crawlers BotPolicies embed.FS ) diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 71cc42a..045c34d 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Enable [importing configuration snippets](./admin/configuration/import.mdx) ([#321](https://github.com/TecharoHQ/anubis/pull/321)) - Refactor check logic to be more generic and work on a Checker type - Add more AI user agents based on the [ai.robots.txt](https://github.com/ai-robots-txt/ai.robots.txt) project - Embedded challenge data in initial HTML response to improve performance diff --git a/docs/docs/admin/configuration/import.mdx b/docs/docs/admin/configuration/import.mdx new file mode 100644 index 0000000..9934ce7 --- /dev/null +++ b/docs/docs/admin/configuration/import.mdx @@ -0,0 +1,147 @@ +# Importing configuration rules + +import Tabs from "@theme/Tabs"; +import TabItem from "@theme/TabItem"; + +Anubis has the ability to let you import snippets of configuration into the main configuration file. This allows you to break up your config into smaller parts that get logically assembled into one big file. 
+ +For example: + +<Tabs> +<TabItem value="json" label="JSON"> + +```json +{ + "bots": [ + { + "import": "(data)/bots/ai-robots-txt.yaml" + }, + { + "import": "(data)/bots/cloudflare-workers.yaml" + } + ] +} +``` + +</TabItem> +<TabItem value="yaml" label="YAML" default> + +```yaml +bots: + # Pathological bots to deny + - # This correlates to data/bots/ai-robots-txt.yaml in the source tree + import: (data)/bots/ai-robots-txt.yaml + - import: (data)/bots/cloudflare-workers.yaml +``` + +</TabItem> +</Tabs> + +Of note, a bot rule can either have inline bot configuration or import a bot config snippet. You cannot do both in a single bot rule. + +<Tabs> +<TabItem value="json" label="JSON"> + +```json +{ + "bots": [ + { + "import": "(data)/bots/ai-robots-txt.yaml", + "name": "generic-browser", + "user_agent_regex": "Mozilla|Opera\n", + "action": "CHALLENGE" + } + ] +} +``` + +</TabItem> +<TabItem value="yaml" label="YAML" default> + +```yaml +bots: + - import: (data)/bots/ai-robots-txt.yaml + name: generic-browser + user_agent_regex: > + Mozilla|Opera + action: CHALLENGE +``` + +</TabItem> +</Tabs> + +This will return an error like this: + +```text +config is not valid: +config.BotOrImport: can't set bot rules and import values at the same time +``` + +Paths can either be prefixed with `(data)` to import from [the data folder in the Anubis source tree](https://github.com/TecharoHQ/anubis/tree/main/data) or point anywhere on the filesystem. If you don't have access to the Anubis source tree, check /usr/share/docs/anubis/data or the tarball you extracted Anubis from. + +## Writing snippets + +Snippets can be written in either JSON or YAML, with a preference for YAML. When writing a snippet, write the bot rules you want directly at the top level of the file in a list. 
+ +Here is an example snippet that allows [IPv6 Unique Local Addresses](https://en.wikipedia.org/wiki/Unique_local_address) through Anubis: + +<Tabs> +<TabItem value="json" label="JSON"> + +```json +[ + { + "name": "ipv6-ula", + "action": "ALLOW", + "remote_addresses": ["fc00::/7"] + } +] +``` + +</TabItem> +<TabItem value="yaml" label="YAML" default> + +```yaml +- name: ipv6-ula + action: ALLOW + remote_addresses: + - fc00::/7 +``` + +</TabItem> +</Tabs> + +## Extracting Anubis' embedded filesystem + +You can always extract the list of rules embedded into the Anubis binary with this command: + +```text +anubis --extract-resources=static +``` + +This will dump the contents of Anubis' embedded data to a new folder named `static`: + +```text +static +├── apps +│ └── gitea-rss-feeds.yaml +├── botPolicies.json +├── botPolicies.yaml +├── bots +│ ├── ai-robots-txt.yaml +│ ├── cloudflare-workers.yaml +│ ├── headless-browsers.yaml +│ └── us-ai-scraper.yaml +├── common +│ ├── allow-private-addresses.yaml +│ └── keep-internet-working.yaml +└── crawlers + ├── bingbot.yaml + ├── duckduckbot.yaml + ├── googlebot.yaml + ├── internet-archive.yaml + ├── kagibot.yaml + ├── marginalia.yaml + ├── mojeekbot.yaml + └── qwantbot.yaml +``` diff --git a/docs/docs/admin/policies.mdx b/docs/docs/admin/policies.mdx index 11af725..b23a62f 100644 --- a/docs/docs/admin/policies.mdx +++ b/docs/docs/admin/policies.mdx @@ -12,6 +12,7 @@ Bot policies let you customize the rules that Anubis uses to allow, deny, or cha - Request path - User agent string - HTTP request header values +- [Importing other configuration snippets](./configuration/import.mdx) As of version v1.17.0 or later, configuration can be written in either JSON or YAML. 
diff --git a/lib/policy/config/config.go b/lib/policy/config/config.go index b3d5cac..627e9cf 100644 --- a/lib/policy/config/config.go +++ b/lib/policy/config/config.go @@ -3,8 +3,15 @@ package config import ( "errors" "fmt" + "io" + "io/fs" "net" + "os" "regexp" + "strings" + + "github.com/TecharoHQ/anubis/data" + "k8s.io/apimachinery/pkg/util/yaml" ) var ( @@ -17,6 +24,9 @@ var ( ErrInvalidPathRegex = errors.New("config.Bot: invalid path regex") ErrInvalidHeadersRegex = errors.New("config.Bot: invalid headers regex") ErrInvalidCIDR = errors.New("config.Bot: invalid CIDR") + ErrInvalidImportStatement = errors.New("config.ImportStatement: invalid source file") + ErrCantSetBotAndImportValuesAtOnce = errors.New("config.BotOrImport: can't set bot rules and import values at the same time") + ErrMustSetBotOrImportRules = errors.New("config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both") ) type Rule string @@ -47,6 +57,24 @@ type BotConfig struct { Challenge *ChallengeRules `json:"challenge,omitempty"` } +func (b BotConfig) Zero() bool { + for _, cond := range []bool{ + b.Name != "", + b.UserAgentRegex != nil, + b.PathRegex != nil, + len(b.HeadersRegex) != 0, + b.Action != "", + len(b.RemoteAddr) != 0, + b.Challenge != nil, + } { + if cond { + return false + } + } + + return true +} + func (b BotConfig) Valid() error { var errs []error @@ -151,9 +179,147 @@ func (cr ChallengeRules) Valid() error { return nil } +type ImportStatement struct { + Import string `json:"import"` + Bots []BotConfig +} + +func (is *ImportStatement) open() (fs.File, error) { + if strings.HasPrefix(is.Import, "(data)/") { + fname := strings.TrimPrefix(is.Import, "(data)/") + fin, err := data.BotPolicies.Open(fname) + return fin, err + } + + return os.Open(is.Import) +} + +func (is *ImportStatement) load() error { + fin, err := is.open() + if err != nil { + return fmt.Errorf("can't open %s: %w", is.Import, err) + } + defer fin.Close() + + var 
result []BotConfig + + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&result); err != nil { + return fmt.Errorf("can't parse %s: %w", is.Import, err) + } + + var errs []error + + for _, b := range result { + if err := b.Valid(); err != nil { + errs = append(errs, err) + } + } + + if len(errs) != 0 { + return fmt.Errorf("config %s is not valid:\n%w", is.Import, errors.Join(errs...)) + } + + is.Bots = result + + return nil +} + +func (is *ImportStatement) Valid() error { + return is.load() +} + +type BotOrImport struct { + *BotConfig `json:",inline"` + *ImportStatement `json:",inline"` +} + +func (boi *BotOrImport) Valid() error { + if boi.BotConfig != nil && boi.ImportStatement != nil { + return ErrCantSetBotAndImportValuesAtOnce + } + + if boi.BotConfig != nil { + return boi.BotConfig.Valid() + } + + if boi.ImportStatement != nil { + return boi.ImportStatement.Valid() + } + + return ErrMustSetBotOrImportRules +} + +type fileConfig struct { + Bots []BotOrImport `json:"bots"` + DNSBL bool `json:"dnsbl"` +} + +func (c fileConfig) Valid() error { + var errs []error + + if len(c.Bots) == 0 { + errs = append(errs, ErrNoBotRulesDefined) + } + + for _, b := range c.Bots { + if err := b.Valid(); err != nil { + errs = append(errs, err) + } + } + + if len(errs) != 0 { + return fmt.Errorf("config is not valid:\n%w", errors.Join(errs...)) + } + + return nil +} + +func Load(fin io.Reader, fname string) (*Config, error) { + var c fileConfig + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { + return nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err) + } + + if err := c.Valid(); err != nil { + return nil, err + } + + result := &Config{ + DNSBL: c.DNSBL, + } + + var validationErrs []error + + for _, boi := range c.Bots { + if boi.ImportStatement != nil { + if err := boi.load(); err != nil { + validationErrs = append(validationErrs, err) + continue + } + + result.Bots = append(result.Bots, boi.ImportStatement.Bots...) 
+ } + + if boi.BotConfig != nil { + if err := boi.BotConfig.Valid(); err != nil { + validationErrs = append(validationErrs, err) + continue + } + + result.Bots = append(result.Bots, *boi.BotConfig) + } + } + + if len(validationErrs) > 0 { + return nil, fmt.Errorf("errors validating policy config %s: %w", fname, errors.Join(validationErrs...)) + } + + return result, nil +} + type Config struct { - Bots []BotConfig `json:"bots"` - DNSBL bool `json:"dnsbl"` + Bots []BotConfig + DNSBL bool } func (c Config) Valid() error { diff --git a/lib/policy/config/config_test.go b/lib/policy/config/config_test.go index 4176126..86c490e 100644 --- a/lib/policy/config/config_test.go +++ b/lib/policy/config/config_test.go @@ -2,10 +2,12 @@ package config import ( "errors" + "io/fs" "os" "path/filepath" "testing" + "github.com/TecharoHQ/anubis/data" "k8s.io/apimachinery/pkg/util/yaml" ) @@ -219,13 +221,69 @@ func TestConfigValidKnownGood(t *testing.T) { } defer fin.Close() - var c Config - if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { - t.Fatalf("can't decode file: %v", err) + c, err := Load(fin, st.Name()) + if err != nil { + t.Fatal(err) } if err := c.Valid(); err != nil { - t.Fatal(err) + t.Error(err) + } + + if len(c.Bots) == 0 { + t.Error("wanted more than 0 bots, got zero") + } + }) + } +} + +func TestImportStatement(t *testing.T) { + type testCase struct { + name string + importPath string + err error + } + + var tests []testCase + + for _, folderName := range []string{ + "apps", + "bots", + "common", + "crawlers", + } { + if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + + tests = append(tests, testCase{ + name: "(data)/" + path, + importPath: "(data)/" + path, + err: nil, + }) + + return nil + }); err != nil { + t.Fatal(err) + } + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + is := &ImportStatement{ + 
Import: tt.importPath, + } + + if err := is.Valid(); err != nil { + t.Errorf("validation error: %v", err) + } + + if len(is.Bots) == 0 { + t.Error("wanted bot definitions, but got none") } }) } @@ -246,7 +304,7 @@ func TestConfigValidBad(t *testing.T) { } defer fin.Close() - var c Config + var c fileConfig if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { t.Fatalf("can't decode file: %v", err) } @@ -259,3 +317,49 @@ func TestConfigValidBad(t *testing.T) { }) } } + +func TestBotConfigZero(t *testing.T) { + var b BotConfig + if !b.Zero() { + t.Error("zero value BotConfig is not zero value") + } + + b.Name = "hi" + if b.Zero() { + t.Error("BotConfig with name is zero value") + } + + b.UserAgentRegex = p(".*") + if b.Zero() { + t.Error("BotConfig with user agent regex is zero value") + } + + b.PathRegex = p(".*") + if b.Zero() { + t.Error("BotConfig with path regex is zero value") + } + + b.HeadersRegex = map[string]string{"hi": "there"} + if b.Zero() { + t.Error("BotConfig with headers regex is zero value") + } + + b.Action = RuleAllow + if b.Zero() { + t.Error("BotConfig with action is zero value") + } + + b.RemoteAddr = []string{"::/0"} + if b.Zero() { + t.Error("BotConfig with remote addresses is zero value") + } + + b.Challenge = &ChallengeRules{ + Difficulty: 4, + ReportAs: 4, + Algorithm: AlgorithmFast, + } + if b.Zero() { + t.Error("BotConfig with challenge rules is zero value") + } +} diff --git a/lib/policy/config/testdata/bad/import_and_bot.json b/lib/policy/config/testdata/bad/import_and_bot.json new file mode 100644 index 0000000..7fa4255 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_and_bot.json @@ -0,0 +1,10 @@ +{ + "bots": [ + { + "import": "(data)/bots/ai-robots-txt.yaml", + "name": "generic-browser", + "user_agent_regex": "Mozilla|Opera\n", + "action": "CHALLENGE" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/import_and_bot.yaml b/lib/policy/config/testdata/bad/import_and_bot.yaml new 
file mode 100644 index 0000000..0080b10 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_and_bot.yaml @@ -0,0 +1,6 @@ +bots: +- import: (data)/bots/ai-robots-txt.yaml + name: generic-browser + user_agent_regex: > + Mozilla|Opera + action: CHALLENGE \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/import_invalid_file.json b/lib/policy/config/testdata/bad/import_invalid_file.json new file mode 100644 index 0000000..c7546c0 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_invalid_file.json @@ -0,0 +1,7 @@ +{ + "bots": [ + { + "import": "(data)/does-not-exist-fake-file.yaml" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/import_invalid_file.yaml b/lib/policy/config/testdata/bad/import_invalid_file.yaml new file mode 100644 index 0000000..df78c06 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_invalid_file.yaml @@ -0,0 +1,2 @@ +bots: +- import: (data)/does-not-exist-fake-file.yaml \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_filesystem.json b/lib/policy/config/testdata/good/import_filesystem.json new file mode 100644 index 0000000..23480c9 --- /dev/null +++ b/lib/policy/config/testdata/good/import_filesystem.json @@ -0,0 +1,7 @@ +{ + "bots": [ + { + "import": "./testdata/hack-test.json" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_filesystem.yaml b/lib/policy/config/testdata/good/import_filesystem.yaml new file mode 100644 index 0000000..422ccc4 --- /dev/null +++ b/lib/policy/config/testdata/good/import_filesystem.yaml @@ -0,0 +1,2 @@ +bots: +- import: ./testdata/hack-test.yaml \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_keep_internet_working.json b/lib/policy/config/testdata/good/import_keep_internet_working.json new file mode 100644 index 0000000..68ff2db --- /dev/null +++ b/lib/policy/config/testdata/good/import_keep_internet_working.json @@ -0,0 +1,7 @@ +{ + "bots": 
[ + { + "import": "(data)/common/keep-internet-working.yaml" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_keep_internet_working.yaml b/lib/policy/config/testdata/good/import_keep_internet_working.yaml new file mode 100644 index 0000000..923ffe3 --- /dev/null +++ b/lib/policy/config/testdata/good/import_keep_internet_working.yaml @@ -0,0 +1,2 @@ +bots: +- import: (data)/common/keep-internet-working.yaml \ No newline at end of file diff --git a/lib/policy/config/testdata/hack-test.json b/lib/policy/config/testdata/hack-test.json new file mode 100644 index 0000000..652dcd8 --- /dev/null +++ b/lib/policy/config/testdata/hack-test.json @@ -0,0 +1,9 @@ +[ + { + "name": "ipv6-ula", + "action": "ALLOW", + "remote_addresses": [ + "fc00::/7" + ] + } +] \ No newline at end of file diff --git a/lib/policy/config/testdata/hack-test.yaml b/lib/policy/config/testdata/hack-test.yaml new file mode 100644 index 0000000..cd4d7d0 --- /dev/null +++ b/lib/policy/config/testdata/hack-test.yaml @@ -0,0 +1,3 @@ +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW \ No newline at end of file diff --git a/lib/policy/policy.go b/lib/policy/policy.go index 368768b..7c45ff6 100644 --- a/lib/policy/policy.go +++ b/lib/policy/policy.go @@ -7,7 +7,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - "k8s.io/apimachinery/pkg/util/yaml" "github.com/TecharoHQ/anubis/lib/policy/config" ) @@ -20,26 +19,22 @@ var ( ) type ParsedConfig struct { - orig config.Config + orig *config.Config Bots []Bot DNSBL bool DefaultDifficulty int } -func NewParsedConfig(orig config.Config) *ParsedConfig { +func NewParsedConfig(orig *config.Config) *ParsedConfig { return &ParsedConfig{ orig: orig, } } func ParseConfig(fin io.Reader, fname string, defaultDifficulty int) (*ParsedConfig, error) { - var c config.Config - if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { - return 
nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err) - } - - if err := c.Valid(); err != nil { + c, err := config.Load(fin, fname) + if err != nil { return nil, err } diff --git a/lib/policy/testdata/hack-test.json b/lib/policy/testdata/hack-test.json new file mode 100644 index 0000000..652dcd8 --- /dev/null +++ b/lib/policy/testdata/hack-test.json @@ -0,0 +1,9 @@ +[ + { + "name": "ipv6-ula", + "action": "ALLOW", + "remote_addresses": [ + "fc00::/7" + ] + } +] \ No newline at end of file diff --git a/lib/policy/testdata/hack-test.yaml b/lib/policy/testdata/hack-test.yaml new file mode 100644 index 0000000..cd4d7d0 --- /dev/null +++ b/lib/policy/testdata/hack-test.yaml @@ -0,0 +1,3 @@ +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW \ No newline at end of file