From d40b5cfdab11c62dc2ed226bde32b19ea7107f21 Mon Sep 17 00:00:00 2001 From: Xe Iaso Date: Sun, 20 Apr 2025 20:09:27 -0400 Subject: [PATCH] lib: move config to yaml (#307) * lib: move config to yaml Signed-off-by: Xe Iaso * web: run go generate Signed-off-by: Xe Iaso * Add Haiku to known instances (#304) Signed-off-by: Asmodeus <46908100+AsmodeumX@users.noreply.github.com> * Add headers bot rule (#300) * Closes #291: add headers support to bot policy rules * Fix config validator * update docs for JSON -> YAML Signed-off-by: Xe Iaso * docs: document http header based actions Signed-off-by: Xe Iaso * lib: add missing test Signed-off-by: Xe Iaso * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Xe Iaso --------- Signed-off-by: Xe Iaso Signed-off-by: Asmodeus <46908100+AsmodeumX@users.noreply.github.com> Co-authored-by: Asmodeus <46908100+AsmodeumX@users.noreply.github.com> Co-authored-by: Neur0toxine Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- data/botPolicies.json | 9 +- data/botPolicies.yaml | 662 ++++++++++++++++++ data/embed.go | 2 +- docs/docs/CHANGELOG.md | 1 + .../admin/environments/docker-compose.mdx | 4 +- docs/docs/admin/installation.mdx | 2 +- docs/docs/admin/native-install.mdx | 8 +- docs/docs/admin/{policies.md => policies.mdx} | 110 +++ docs/docs/index.mdx | 2 +- go.mod | 3 + go.sum | 6 + lib/anubis.go | 4 +- lib/anubis_test.go | 19 +- lib/policy/config/config_test.go | 7 +- .../config/testdata/bad/badregexes.yaml | 7 + lib/policy/config/testdata/bad/invalid.yaml | 1 + lib/policy/config/testdata/bad/nobots.yaml | 1 + .../config/testdata/good/allow_everyone.yaml | 6 + .../testdata/good/block_cf_workers.yaml | 5 + .../testdata/good/challengemozilla.yaml | 4 + .../testdata/good/everything_blocked.yaml | 4 + lib/policy/policy.go | 6 +- 22 files changed, 854 insertions(+), 19 deletions(-) create mode 100644 data/botPolicies.yaml rename 
docs/docs/admin/{policies.md => policies.mdx} (75%) create mode 100644 lib/policy/config/testdata/bad/badregexes.yaml create mode 100644 lib/policy/config/testdata/bad/invalid.yaml create mode 100644 lib/policy/config/testdata/bad/nobots.yaml create mode 100644 lib/policy/config/testdata/good/allow_everyone.yaml create mode 100644 lib/policy/config/testdata/good/block_cf_workers.yaml create mode 100644 lib/policy/config/testdata/good/challengemozilla.yaml create mode 100644 lib/policy/config/testdata/good/everything_blocked.yaml diff --git a/data/botPolicies.json b/data/botPolicies.json index dbc3d35..1993d22 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -1,5 +1,12 @@ { "bots": [ + { + "name": "cloudflare-workers", + "headers_regex": { + "CF-Worker": ".*" + }, + "action": "DENY" + }, { "name": "ai-robots-txt", "user_agent_regex": "AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot", @@ -680,4 +687,4 @@ } ], "dnsbl": false -} +} \ No newline at end of file diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml new file mode 100644 index 0000000..e8448ac --- /dev/null +++ b/data/botPolicies.yaml @@ -0,0 +1,662 @@ +bots: +# Pathological bots to deny +- name: us-artificial-intelligence-scraper + user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper + action: DENY +- name: lightpanda + user_agent_regex: ^LightPanda/.*$ + action: DENY +- name: headless-chrome + 
user_agent_regex: HeadlessChrome + action: DENY +- name: headless-chromium + user_agent_regex: HeadlessChromium + action: DENY +- name: "ai-robots-txt" + user_agent_regex: >- + AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot + action: DENY +- name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY + +# search engines to allow +- name: googlebot + user_agent_regex: \+http\://www\.google\.com/bot\.html + action: ALLOW + # https://developers.google.com/static/search/apis/ipranges/googlebot.json + remote_addresses: [ + "2001:4860:4801:10::/64", + "2001:4860:4801:11::/64", + "2001:4860:4801:12::/64", + "2001:4860:4801:13::/64", + "2001:4860:4801:14::/64", + "2001:4860:4801:15::/64", + "2001:4860:4801:16::/64", + "2001:4860:4801:17::/64", + "2001:4860:4801:18::/64", + "2001:4860:4801:19::/64", + "2001:4860:4801:1a::/64", + "2001:4860:4801:1b::/64", + "2001:4860:4801:1c::/64", + "2001:4860:4801:1d::/64", + "2001:4860:4801:1e::/64", + "2001:4860:4801:1f::/64", + "2001:4860:4801:20::/64", + "2001:4860:4801:21::/64", + "2001:4860:4801:22::/64", + "2001:4860:4801:23::/64", + "2001:4860:4801:24::/64", + "2001:4860:4801:25::/64", + "2001:4860:4801:26::/64", + "2001:4860:4801:27::/64", + "2001:4860:4801:28::/64", + "2001:4860:4801:29::/64", + "2001:4860:4801:2::/64", + "2001:4860:4801:2a::/64", + "2001:4860:4801:2b::/64", + "2001:4860:4801:2c::/64", + "2001:4860:4801:2d::/64", +
"2001:4860:4801:2e::/64", + "2001:4860:4801:2f::/64", + "2001:4860:4801:31::/64", + "2001:4860:4801:32::/64", + "2001:4860:4801:33::/64", + "2001:4860:4801:34::/64", + "2001:4860:4801:35::/64", + "2001:4860:4801:36::/64", + "2001:4860:4801:37::/64", + "2001:4860:4801:38::/64", + "2001:4860:4801:39::/64", + "2001:4860:4801:3a::/64", + "2001:4860:4801:3b::/64", + "2001:4860:4801:3c::/64", + "2001:4860:4801:3d::/64", + "2001:4860:4801:3e::/64", + "2001:4860:4801:40::/64", + "2001:4860:4801:41::/64", + "2001:4860:4801:42::/64", + "2001:4860:4801:43::/64", + "2001:4860:4801:44::/64", + "2001:4860:4801:45::/64", + "2001:4860:4801:46::/64", + "2001:4860:4801:47::/64", + "2001:4860:4801:48::/64", + "2001:4860:4801:49::/64", + "2001:4860:4801:4a::/64", + "2001:4860:4801:4b::/64", + "2001:4860:4801:4c::/64", + "2001:4860:4801:50::/64", + "2001:4860:4801:51::/64", + "2001:4860:4801:52::/64", + "2001:4860:4801:53::/64", + "2001:4860:4801:54::/64", + "2001:4860:4801:55::/64", + "2001:4860:4801:56::/64", + "2001:4860:4801:60::/64", + "2001:4860:4801:61::/64", + "2001:4860:4801:62::/64", + "2001:4860:4801:63::/64", + "2001:4860:4801:64::/64", + "2001:4860:4801:65::/64", + "2001:4860:4801:66::/64", + "2001:4860:4801:67::/64", + "2001:4860:4801:68::/64", + "2001:4860:4801:69::/64", + "2001:4860:4801:6a::/64", + "2001:4860:4801:6b::/64", + "2001:4860:4801:6c::/64", + "2001:4860:4801:6d::/64", + "2001:4860:4801:6e::/64", + "2001:4860:4801:6f::/64", + "2001:4860:4801:70::/64", + "2001:4860:4801:71::/64", + "2001:4860:4801:72::/64", + "2001:4860:4801:73::/64", + "2001:4860:4801:74::/64", + "2001:4860:4801:75::/64", + "2001:4860:4801:76::/64", + "2001:4860:4801:77::/64", + "2001:4860:4801:78::/64", + "2001:4860:4801:79::/64", + "2001:4860:4801:80::/64", + "2001:4860:4801:81::/64", + "2001:4860:4801:82::/64", + "2001:4860:4801:83::/64", + "2001:4860:4801:84::/64", + "2001:4860:4801:85::/64", + "2001:4860:4801:86::/64", + "2001:4860:4801:87::/64", + "2001:4860:4801:88::/64", + 
"2001:4860:4801:90::/64", + "2001:4860:4801:91::/64", + "2001:4860:4801:92::/64", + "2001:4860:4801:93::/64", + "2001:4860:4801:94::/64", + "2001:4860:4801:95::/64", + "2001:4860:4801:96::/64", + "2001:4860:4801:a0::/64", + "2001:4860:4801:a1::/64", + "2001:4860:4801:a2::/64", + "2001:4860:4801:a3::/64", + "2001:4860:4801:a4::/64", + "2001:4860:4801:a5::/64", + "2001:4860:4801:c::/64", + "2001:4860:4801:f::/64", + "192.178.5.0/27", + "192.178.6.0/27", + "192.178.6.128/27", + "192.178.6.160/27", + "192.178.6.192/27", + "192.178.6.32/27", + "192.178.6.64/27", + "192.178.6.96/27", + "34.100.182.96/28", + "34.101.50.144/28", + "34.118.254.0/28", + "34.118.66.0/28", + "34.126.178.96/28", + "34.146.150.144/28", + "34.147.110.144/28", + "34.151.74.144/28", + "34.152.50.64/28", + "34.154.114.144/28", + "34.155.98.32/28", + "34.165.18.176/28", + "34.175.160.64/28", + "34.176.130.16/28", + "34.22.85.0/27", + "34.64.82.64/28", + "34.65.242.112/28", + "34.80.50.80/28", + "34.88.194.0/28", + "34.89.10.80/28", + "34.89.198.80/28", + "34.96.162.48/28", + "35.247.243.240/28", + "66.249.64.0/27", + "66.249.64.128/27", + "66.249.64.160/27", + "66.249.64.224/27", + "66.249.64.32/27", + "66.249.64.64/27", + "66.249.64.96/27", + "66.249.65.0/27", + "66.249.65.128/27", + "66.249.65.160/27", + "66.249.65.192/27", + "66.249.65.224/27", + "66.249.65.32/27", + "66.249.65.64/27", + "66.249.65.96/27", + "66.249.66.0/27", + "66.249.66.128/27", + "66.249.66.160/27", + "66.249.66.192/27", + "66.249.66.224/27", + "66.249.66.32/27", + "66.249.66.64/27", + "66.249.66.96/27", + "66.249.68.0/27", + "66.249.68.128/27", + "66.249.68.32/27", + "66.249.68.64/27", + "66.249.68.96/27", + "66.249.69.0/27", + "66.249.69.128/27", + "66.249.69.160/27", + "66.249.69.192/27", + "66.249.69.224/27", + "66.249.69.32/27", + "66.249.69.64/27", + "66.249.69.96/27", + "66.249.70.0/27", + "66.249.70.128/27", + "66.249.70.160/27", + "66.249.70.192/27", + "66.249.70.224/27", + "66.249.70.32/27", + "66.249.70.64/27", + 
"66.249.70.96/27", + "66.249.71.0/27", + "66.249.71.128/27", + "66.249.71.160/27", + "66.249.71.192/27", + "66.249.71.224/27", + "66.249.71.32/27", + "66.249.71.64/27", + "66.249.71.96/27", + "66.249.72.0/27", + "66.249.72.128/27", + "66.249.72.160/27", + "66.249.72.192/27", + "66.249.72.224/27", + "66.249.72.32/27", + "66.249.72.64/27", + "66.249.72.96/27", + "66.249.73.0/27", + "66.249.73.128/27", + "66.249.73.160/27", + "66.249.73.192/27", + "66.249.73.224/27", + "66.249.73.32/27", + "66.249.73.64/27", + "66.249.73.96/27", + "66.249.74.0/27", + "66.249.74.128/27", + "66.249.74.160/27", + "66.249.74.192/27", + "66.249.74.32/27", + "66.249.74.64/27", + "66.249.74.96/27", + "66.249.75.0/27", + "66.249.75.128/27", + "66.249.75.160/27", + "66.249.75.192/27", + "66.249.75.224/27", + "66.249.75.32/27", + "66.249.75.64/27", + "66.249.75.96/27", + "66.249.76.0/27", + "66.249.76.128/27", + "66.249.76.160/27", + "66.249.76.192/27", + "66.249.76.224/27", + "66.249.76.32/27", + "66.249.76.64/27", + "66.249.76.96/27", + "66.249.77.0/27", + "66.249.77.128/27", + "66.249.77.160/27", + "66.249.77.192/27", + "66.249.77.224/27", + "66.249.77.32/27", + "66.249.77.64/27", + "66.249.77.96/27", + "66.249.78.0/27", + "66.249.78.32/27", + "66.249.79.0/27", + "66.249.79.128/27", + "66.249.79.160/27", + "66.249.79.192/27", + "66.249.79.224/27", + "66.249.79.32/27", + "66.249.79.64/27", + "66.249.79.96/27" + ] +- name: bingbot + user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm + action: ALLOW + # https://www.bing.com/toolbox/bingbot.json + remote_addresses: [ + "157.55.39.0/24", + "207.46.13.0/24", + "40.77.167.0/24", + "13.66.139.0/24", + "13.66.144.0/24", + "52.167.144.0/24", + "13.67.10.16/28", + "13.69.66.240/28", + "13.71.172.224/28", + "139.217.52.0/28", + "191.233.204.224/28", + "20.36.108.32/28", + "20.43.120.16/28", + "40.79.131.208/28", + "40.79.186.176/28", + "52.231.148.0/28", + "20.79.107.240/28", + "51.105.67.0/28", + "20.125.163.80/28", + "40.77.188.0/22", + 
"65.55.210.0/24", + "199.30.24.0/23", + "40.77.202.0/24", + "40.77.139.0/25", + "20.74.197.0/28", + "20.15.133.160/27", + "40.77.177.0/24", + "40.77.178.0/23" + ] +- name: duckduckbot + user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) + action: ALLOW + # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot + remote_addresses: [ + "57.152.72.128/32", + "51.8.253.152/32", + "40.80.242.63/32", + "20.12.141.99/32", + "20.49.136.28/32", + "51.116.131.221/32", + "51.107.40.209/32", + "20.40.133.240/32", + "20.50.168.91/32", + "51.120.48.122/32", + "20.193.45.113/32", + "40.76.173.151/32", + "40.76.163.7/32", + "20.185.79.47/32", + "52.142.26.175/32", + "20.185.79.15/32", + "52.142.24.149/32", + "40.76.162.208/32", + "40.76.163.23/32", + "40.76.162.191/32", + "40.76.162.247/32", + "40.88.21.235/32", + "20.191.45.212/32", + "52.146.59.12/32", + "52.146.59.156/32", + "52.146.59.154/32", + "52.146.58.236/32", + "20.62.224.44/32", + "51.104.180.53/32", + "51.104.180.47/32", + "51.104.180.26/32", + "51.104.146.225/32", + "51.104.146.235/32", + "20.73.202.147/32", + "20.73.132.240/32", + "20.71.12.143/32", + "20.56.197.58/32", + "20.56.197.63/32", + "20.43.150.93/32", + "20.43.150.85/32", + "20.44.222.1/32", + "40.89.243.175/32", + "13.89.106.77/32", + "52.143.242.6/32", + "52.143.241.111/32", + "52.154.60.82/32", + "20.197.209.11/32", + "20.197.209.27/32", + "20.226.133.105/32", + "191.234.216.4/32", + "191.234.216.178/32", + "20.53.92.211/32", + "20.53.91.2/32", + "20.207.99.197/32", + "20.207.97.190/32", + "40.81.250.205/32", + "40.64.106.11/32", + "40.64.105.247/32", + "20.72.242.93/32", + "20.99.255.235/32", + "20.113.3.121/32", + "52.224.16.221/32", + "52.224.21.53/32", + "52.224.20.204/32", + "52.224.21.19/32", + "52.224.20.249/32", + "52.224.20.203/32", + "52.224.20.190/32", + "52.224.16.229/32", + "52.224.21.20/32", + "52.146.63.80/32", + "52.224.20.227/32", + "52.224.20.193/32", + "52.190.37.160/32", + "52.224.21.23/32", 
+ "52.224.20.223/32", + "52.224.20.181/32", + "52.224.21.49/32", + "52.224.21.55/32", + "52.224.21.61/32", + "52.224.19.152/32", + "52.224.20.186/32", + "52.224.21.27/32", + "52.224.21.51/32", + "52.224.20.174/32", + "52.224.21.4/32", + "51.104.164.109/32", + "51.104.167.71/32", + "51.104.160.177/32", + "51.104.162.149/32", + "51.104.167.95/32", + "51.104.167.54/32", + "51.104.166.111/32", + "51.104.167.88/32", + "51.104.161.32/32", + "51.104.163.250/32", + "51.104.164.189/32", + "51.104.167.19/32", + "51.104.160.167/32", + "51.104.167.110/32", + "20.191.44.119/32", + "51.104.167.104/32", + "20.191.44.234/32", + "51.104.164.215/32", + "51.104.167.52/32", + "20.191.44.22/32", + "51.104.167.87/32", + "51.104.167.96/32", + "20.191.44.16/32", + "51.104.167.61/32", + "51.104.164.147/32", + "20.50.48.159/32", + "40.114.182.172/32", + "20.50.50.130/32", + "20.50.50.163/32", + "20.50.50.46/32", + "40.114.182.153/32", + "20.50.50.118/32", + "20.50.49.55/32", + "20.50.49.25/32", + "40.114.183.251/32", + "20.50.50.123/32", + "20.50.49.237/32", + "20.50.48.192/32", + "20.50.50.134/32", + "51.138.90.233/32", + "40.114.183.196/32", + "20.50.50.146/32", + "40.114.183.88/32", + "20.50.50.145/32", + "20.50.50.121/32", + "20.50.49.40/32", + "51.138.90.206/32", + "40.114.182.45/32", + "51.138.90.161/32", + "20.50.49.0/32", + "40.119.232.215/32", + "104.43.55.167/32", + "40.119.232.251/32", + "40.119.232.50/32", + "40.119.232.146/32", + "40.119.232.218/32", + "104.43.54.127/32", + "104.43.55.117/32", + "104.43.55.116/32", + "104.43.55.166/32", + "52.154.169.50/32", + "52.154.171.70/32", + "52.154.170.229/32", + "52.154.170.113/32", + "52.154.171.44/32", + "52.154.172.2/32", + "52.143.244.81/32", + "52.154.171.87/32", + "52.154.171.250/32", + "52.154.170.28/32", + "52.154.170.122/32", + "52.143.243.117/32", + "52.143.247.235/32", + "52.154.171.235/32", + "52.154.171.196/32", + "52.154.171.0/32", + "52.154.170.243/32", + "52.154.170.26/32", + "52.154.169.200/32", + "52.154.170.96/32", + 
"52.154.170.88/32", + "52.154.171.150/32", + "52.154.171.205/32", + "52.154.170.117/32", + "52.154.170.209/32", + "191.235.202.48/32", + "191.233.3.202/32", + "191.235.201.214/32", + "191.233.3.197/32", + "191.235.202.38/32", + "20.53.78.144/32", + "20.193.24.10/32", + "20.53.78.236/32", + "20.53.78.138/32", + "20.53.78.123/32", + "20.53.78.106/32", + "20.193.27.215/32", + "20.193.25.197/32", + "20.193.12.126/32", + "20.193.24.251/32", + "20.204.242.101/32", + "20.207.72.113/32", + "20.204.242.19/32", + "20.219.45.67/32", + "20.207.72.11/32", + "20.219.45.190/32", + "20.204.243.55/32", + "20.204.241.148/32", + "20.207.72.110/32", + "20.204.240.172/32", + "20.207.72.21/32", + "20.204.246.81/32", + "20.207.107.181/32", + "20.204.246.254/32", + "20.219.43.246/32", + "52.149.25.43/32", + "52.149.61.51/32", + "52.149.58.139/32", + "52.149.60.38/32", + "52.148.165.38/32", + "52.143.95.162/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + 
"20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32" + ] +- name: qwantbot + user_agent_regex: \+https\://help\.qwant\.com/bot/ + action: ALLOW + # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + remote_addresses: [ "91.242.162.0/24" ] +- name: internet-archive + action: ALLOW + # https://ipinfo.io/AS7941 + remote_addresses: [ + "207.241.224.0/20", + "208.70.24.0/21", + "2620:0:9c0::/48" + ] +- name: kagibot + user_agent_regex: \+https\://kagi\.com/bot + action: ALLOW + # https://kagi.com/bot + remote_addresses: [ + "216.18.205.234/32", + "35.212.27.76/32", + "104.254.65.50/32", + "209.151.156.194/32" + ] +- name: marginalia + user_agent_regex: search\.marginalia\.nu + action: ALLOW + # Received directly over email + remote_addresses: [ + "193.183.0.162/31", + "193.183.0.164/30", + "193.183.0.168/30", + "193.183.0.172/31", + "193.183.0.174/32" + ] +- name: mojeekbot + user_agent_regex: http\://www\.mojeek\.com/bot\.html + action: ALLOW + # https://www.mojeek.com/bot.html + remote_addresses: [ "5.102.173.71/32" ] + +# Common "keeping the internet working" routes +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW +- name: favicon + path_regex: ^/favicon.ico$ + action: ALLOW +- name: robots-txt + path_regex: ^/robots.txt$ + action: ALLOW + +# Punish any bot with "bot" in the user-agent string +- name: generic-bot-catchall + user_agent_regex: (?i:bot|crawler) + action: CHALLENGE + challenge: + difficulty: 16 # impossible + report_as: 4 # lie to the operator + algorithm: slow # intentionally waste CPU cycles and time + +- name: generic-browser + user_agent_regex: >- + Mozilla|Opera + action: CHALLENGE + +dnsbl: false diff --git a/data/embed.go b/data/embed.go index 5a5f4d2..c1fbe68 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package
data import "embed" var ( - //go:embed botPolicies.json + //go:embed botPolicies.yaml botPolicies.json BotPolicies embed.FS ) diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 45c1f59..1c634a8 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added example Apache configuration to the documentation [#277](https://github.com/TecharoHQ/anubis/issues/277) - Move per-environment configuration details into their own pages - Added headers support to bot policy rules +- Moved configuration file from JSON to YAML by default - Added documentation on how to use Anubis with Traefik in Docker ## v1.16.0 diff --git a/docs/docs/admin/environments/docker-compose.mdx b/docs/docs/admin/environments/docker-compose.mdx index b40e0ea..6783808 100644 --- a/docs/docs/admin/environments/docker-compose.mdx +++ b/docs/docs/admin/environments/docker-compose.mdx @@ -12,13 +12,13 @@ services: METRICS_BIND: ":9090" SERVE_ROBOTS_TXT: "true" TARGET: "http://nginx" - POLICY_FNAME: "/data/cfg/botPolicy.json" + POLICY_FNAME: "/data/cfg/botPolicy.yaml" OG_PASSTHROUGH: "true" OG_EXPIRY_TIME: "24h" ports: - 8080:8080 volumes: - - "./botPolicy.json:/data/cfg/botPolicy.json:ro" + - "./botPolicy.yaml:/data/cfg/botPolicy.yaml:ro" nginx: image: nginx volumes: diff --git a/docs/docs/admin/installation.mdx b/docs/docs/admin/installation.mdx index 9c88930..2333b1d 100644 --- a/docs/docs/admin/installation.mdx +++ b/docs/docs/admin/installation.mdx @@ -62,7 +62,7 @@ Anubis uses these environment variables for configuration: | `METRICS_BIND_NETWORK` | `tcp` | The address family that the Anubis metrics server listens on. See `BIND_NETWORK` for more information. | | `OG_EXPIRY_TIME` | `24h` | The expiration time for the Open Graph tag cache. | | `OG_PASSTHROUGH` | `false` | If set to `true`, Anubis will enable Open Graph tag passthrough. 
| -| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.md). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. | +| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.mdx). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. | | `SERVE_ROBOTS_TXT` | `false` | If set `true`, Anubis will serve a default `robots.txt` file that disallows all known AI scrapers by name and then additionally disallows every scraper. This is useful if facts and circumstances make it difficult to change the underlying service to serve such a `robots.txt` file. | | `SOCKET_MODE` | `0770` | _Only used when at least one of the `*_BIND_NETWORK` variables are set to `unix`._ The socket mode (permissions) for Unix domain sockets. | | `TARGET` | `http://localhost:3923` | The URL of the service that Anubis should forward valid requests to. Supports Unix domain sockets, set this to a URI like so: `unix:///path/to/socket.sock`. 
| diff --git a/docs/docs/admin/native-install.mdx b/docs/docs/admin/native-install.mdx index 8faa5cb..a615929 100644 --- a/docs/docs/admin/native-install.mdx +++ b/docs/docs/admin/native-install.mdx @@ -86,20 +86,20 @@ Once it's installed, make a copy of the default configuration file `/etc/anubis/ sudo cp /etc/anubis/default.env /etc/anubis/gitea.env ``` -Copy the default bot policies file to `/etc/anubis/gitea.botPolicies.json`: +Copy the default bot policies file to `/etc/anubis/gitea.botPolicies.yaml`: ```text -sudo cp /usr/share/doc/anubis/botPolicies.json /etc/anubis/gitea.botPolicies.json +sudo cp /usr/share/doc/anubis/botPolicies.yaml /etc/anubis/gitea.botPolicies.yaml ``` ```text -sudo cp ./doc/botPolicies.json /etc/anubis/gitea.botPolicies.json +sudo cp ./doc/botPolicies.yaml /etc/anubis/gitea.botPolicies.yaml ``` @@ -114,7 +114,7 @@ BIND_NETWORK=tcp DIFFICULTY=4 METRICS_BIND=[::1]:8240 METRICS_BIND_NETWORK=tcp -POLICY_FNAME=/etc/anubis/gitea.botPolicies.json +POLICY_FNAME=/etc/anubis/gitea.botPolicies.yaml TARGET=http://localhost:3000 ``` diff --git a/docs/docs/admin/policies.md b/docs/docs/admin/policies.mdx similarity index 75% rename from docs/docs/admin/policies.md rename to docs/docs/admin/policies.mdx index c4034a3..a5f6f1e 100644 --- a/docs/docs/admin/policies.md +++ b/docs/docs/admin/policies.mdx @@ -2,15 +2,24 @@ title: Policy Definitions --- +import Tabs from "@theme/Tabs"; +import TabItem from "@theme/TabItem"; + Out of the box, Anubis is pretty heavy-handed. It will aggressively challenge everything that might be a browser (usually indicated by having `Mozilla` in its user agent). However, some bots are smart enough to get past the challenge. Some things that look like bots may actually be fine (IE: RSS readers). Some resources need to be visible no matter what. Some resources and remotes are fine to begin with. Bot policies let you customize the rules that Anubis uses to allow, deny, or challenge incoming requests. 
Currently you can set policies by the following matches: - Request path - User agent string +- HTTP request header values + +As of version v1.17.0 or later, configuration can be written in either JSON or YAML. Here's an example rule that denies [Amazonbot](https://developer.amazon.com/en/amazonbot): + + + ```json { "name": "amazonbot", @@ -19,15 +28,37 @@ Here's an example rule that denies [Amazonbot](https://developer.amazon.com/en/a } ``` + + + +```yaml +- name: amazonbot + user_agent_regex: Amazonbot + action: DENY +``` + + + + When this rule is evaluated, Anubis will check the `User-Agent` string of the request. If it contains `Amazonbot`, Anubis will send an error page to the user saying that access is denied, but in such a way that makes scrapers think they have correctly loaded the webpage. Right now the only kinds of policies you can write are bot policies. Other forms of policies will be added in the future. Here is a minimal policy file that will protect against most scraper bots: + + + ```json { "bots": [ + { + "name": "cloudflare-workers", + "headers_regex": { + "CF-Worker": ".*" + }, + "action": "DENY" + }, { "name": "well-known", "path_regex": "^/.well-known/.*$", @@ -52,6 +83,32 @@ Here is a minimal policy file that will protect against most scraper bots: } ``` + + + +```yaml +bots: + - name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY + - name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW + - name: favicon + path_regex: ^/favicon.ico$ + action: ALLOW + - name: robots-txt + path_regex: ^/robots.txt$ + action: ALLOW + - name: generic-browser + user_agent_regex: Mozilla + action: CHALLENGE +``` + + + + This allows requests to [`/.well-known`](https://en.wikipedia.org/wiki/Well-known_URI), `/favicon.ico`, `/robots.txt`, and challenges any request that has the word `Mozilla` in its User-Agent string. 
The [default policy file](https://github.com/TecharoHQ/anubis/blob/main/data/botPolicies.json) is a bit more cohesive, but this should be more than enough for most users. If no rules match the request, it is allowed through. @@ -72,6 +129,9 @@ Name your rules in lower case using kebab-case. Rule names will be exposed in Pr Rules can also have their own challenge settings. These are customized using the `"challenge"` key. For example, here is a rule that makes challenges artificially hard for connections with the substring "bot" in their user agent: + + + ```json { "name": "generic-bot-catchall", @@ -85,6 +145,23 @@ Rules can also have their own challenge settings. These are customized using the } ``` + + + +```yaml +# Punish any bot with "bot" in the user-agent string +- name: generic-bot-catchall + user_agent_regex: (?i:bot|crawler) + action: CHALLENGE + challenge: + difficulty: 16 # impossible + report_as: 4 # lie to the operator + algorithm: slow # intentionally waste CPU cycles and time +``` + + + + Challenges can be configured with these settings: | Key | Example | Description | @@ -99,6 +176,9 @@ The `remote_addresses` field of a Bot rule allows you to set the IP range that t For example, you can allow a search engine to connect if and only if its IP address matches the ones they published: + + + ```json { "name": "qwantbot", @@ -108,8 +188,25 @@ For example, you can allow a search engine to connect if and only if its IP addr } ``` + + + +```yaml +- name: qwantbot + user_agent_regex: \+https\://help\.qwant\.com/bot/ + action: ALLOW + # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + remote_addresses: ["91.242.162.0/24"] +``` + + + + This also works at an IP range level without any other checks: + + + ```json { "name": "internal-network", @@ -118,6 +215,19 @@ This also works at an IP range level without any other checks: } ``` + + + +```yaml +name: internal-network +action: ALLOW +remote_addresses: + - 100.64.0.0/10 +``` + + + + ## 
Risk calculation for downstream services In case your service needs it for risk calculation reasons, Anubis exposes information about the rules that any requests match using a few headers: diff --git a/docs/docs/index.mdx b/docs/docs/index.mdx index 7f00850..04e3f96 100644 --- a/docs/docs/index.mdx +++ b/docs/docs/index.mdx @@ -19,7 +19,7 @@ Anubis [weighs the soul of your connection](https://en.wikipedia.org/wiki/Weighi This program is designed to help protect the small internet from the endless storm of requests that flood in from AI companies. Anubis is as lightweight as possible to ensure that everyone can afford to protect the communities closest to them. -Anubis is a bit of a nuclear response. This will result in your website being blocked from smaller scrapers and may inhibit "good bots" like the Internet Archive. You can configure [bot policy definitions](./admin/policies.md) to explicitly allowlist them and we are working on a curated set of "known good" bots to allow for a compromise between discoverability and uptime. +Anubis is a bit of a nuclear response. This will result in your website being blocked from smaller scrapers and may inhibit "good bots" like the Internet Archive. You can configure [bot policy definitions](./admin/policies.mdx) to explicitly allowlist them and we are working on a curated set of "known good" bots to allow for a compromise between discoverability and uptime. 
## Support diff --git a/go.mod b/go.mod index d936bfb..aa1c5e0 100644 --- a/go.mod +++ b/go.mod @@ -45,6 +45,9 @@ require ( golang.org/x/tools v0.31.0 // indirect google.golang.org/protobuf v1.36.5 // indirect honnef.co/go/tools v0.6.1 // indirect + k8s.io/apimachinery v0.32.3 // indirect + sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect ) tool ( diff --git a/go.sum b/go.sum index 0079081..5b32f78 100644 --- a/go.sum +++ b/go.sum @@ -138,3 +138,9 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.6.1 h1:R094WgE8K4JirYjBaOpz/AvTyUu/3wbmAoskKN/pxTI= honnef.co/go/tools v0.6.1/go.mod h1:3puzxxljPCe8RGJX7BIy1plGbxEOZni5mR2aXe3/uk4= +k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U= +k8s.io/apimachinery v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/lib/anubis.go b/lib/anubis.go index ba143f9..afc3d86 100644 --- a/lib/anubis.go +++ b/lib/anubis.go @@ -90,8 +90,8 @@ func LoadPoliciesOrDefault(fname string, defaultDifficulty int) (*policy.ParsedC return nil, fmt.Errorf("can't parse policy file %s: %w", fname, err) } } else { - fname = "(data)/botPolicies.json" - fin, err = data.BotPolicies.Open("botPolicies.json") + fname = "(data)/botPolicies.yaml" + fin, err = data.BotPolicies.Open("botPolicies.yaml") if err != nil { return nil, fmt.Errorf("[unexpected] can't parse builtin policy file %s: %w", fname, err) } diff --git a/lib/anubis_test.go b/lib/anubis_test.go index 156863d..60b7913 
100644 --- a/lib/anubis_test.go +++ b/lib/anubis_test.go @@ -8,6 +8,7 @@ import ( "testing" "github.com/TecharoHQ/anubis" + "github.com/TecharoHQ/anubis/data" "github.com/TecharoHQ/anubis/internal" "github.com/TecharoHQ/anubis/lib/policy" ) @@ -15,7 +16,7 @@ import ( func loadPolicies(t *testing.T, fname string) *policy.ParsedConfig { t.Helper() - anubisPolicy, err := LoadPoliciesOrDefault("", anubis.DefaultDifficulty) + anubisPolicy, err := LoadPoliciesOrDefault(fname, anubis.DefaultDifficulty) if err != nil { t.Fatal(err) } @@ -55,6 +56,22 @@ func makeChallenge(t *testing.T, ts *httptest.Server) challenge { return chall } +func TestLoadPolicies(t *testing.T) { + for _, fname := range []string{"botPolicies.json", "botPolicies.yaml"} { + t.Run(fname, func(t *testing.T) { + fin, err := data.BotPolicies.Open(fname) + if err != nil { + t.Fatal(err) + } + defer fin.Close() + + if _, err := policy.ParseConfig(fin, fname, 4); err != nil { + t.Fatal(err) + } + }) + } +} + // Regression test for CVE-2025-24369 func TestCVE2025_24369(t *testing.T) { pol := loadPolicies(t, "") diff --git a/lib/policy/config/config_test.go b/lib/policy/config/config_test.go index 0fabbb7..4176126 100644 --- a/lib/policy/config/config_test.go +++ b/lib/policy/config/config_test.go @@ -1,11 +1,12 @@ package config import ( - "encoding/json" "errors" "os" "path/filepath" "testing" + + "k8s.io/apimachinery/pkg/util/yaml" ) func p[V any](v V) *V { return &v } @@ -219,7 +220,7 @@ func TestConfigValidKnownGood(t *testing.T) { defer fin.Close() var c Config - if err := json.NewDecoder(fin).Decode(&c); err != nil { + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { t.Fatalf("can't decode file: %v", err) } @@ -246,7 +247,7 @@ func TestConfigValidBad(t *testing.T) { defer fin.Close() var c Config - if err := json.NewDecoder(fin).Decode(&c); err != nil { + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { t.Fatalf("can't decode file: %v", err) } diff --git 
a/lib/policy/config/testdata/bad/badregexes.yaml b/lib/policy/config/testdata/bad/badregexes.yaml new file mode 100644 index 0000000..3880e40 --- /dev/null +++ b/lib/policy/config/testdata/bad/badregexes.yaml @@ -0,0 +1,7 @@ +bots: +- name: path-bad + path_regex: "a(b" + action: DENY +- name: user-agent-bad + user_agent_regex: "a(b" + action: DENY \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/invalid.yaml b/lib/policy/config/testdata/bad/invalid.yaml new file mode 100644 index 0000000..18625b6 --- /dev/null +++ b/lib/policy/config/testdata/bad/invalid.yaml @@ -0,0 +1 @@ +bots: [] \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/nobots.yaml b/lib/policy/config/testdata/bad/nobots.yaml new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/lib/policy/config/testdata/bad/nobots.yaml @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/lib/policy/config/testdata/good/allow_everyone.yaml b/lib/policy/config/testdata/good/allow_everyone.yaml new file mode 100644 index 0000000..5c49534 --- /dev/null +++ b/lib/policy/config/testdata/good/allow_everyone.yaml @@ -0,0 +1,6 @@ +bots: +- name: everyones-invited + remote_addresses: + - "0.0.0.0/0" + - "::/0" + action: ALLOW \ No newline at end of file diff --git a/lib/policy/config/testdata/good/block_cf_workers.yaml b/lib/policy/config/testdata/good/block_cf_workers.yaml new file mode 100644 index 0000000..c66bade --- /dev/null +++ b/lib/policy/config/testdata/good/block_cf_workers.yaml @@ -0,0 +1,5 @@ +bots: + - name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY \ No newline at end of file diff --git a/lib/policy/config/testdata/good/challengemozilla.yaml b/lib/policy/config/testdata/good/challengemozilla.yaml new file mode 100644 index 0000000..15922b0 --- /dev/null +++ b/lib/policy/config/testdata/good/challengemozilla.yaml @@ -0,0 +1,4 @@ +bots: +- name: generic-browser + user_agent_regex: Mozilla + action: CHALLENGE \ No newline at end of 
file diff --git a/lib/policy/config/testdata/good/everything_blocked.yaml b/lib/policy/config/testdata/good/everything_blocked.yaml new file mode 100644 index 0000000..323c596 --- /dev/null +++ b/lib/policy/config/testdata/good/everything_blocked.yaml @@ -0,0 +1,4 @@ +bots: +- name: everything + user_agent_regex: .* + action: DENY diff --git a/lib/policy/policy.go b/lib/policy/policy.go index 4451b08..2d610c8 100644 --- a/lib/policy/policy.go +++ b/lib/policy/policy.go @@ -1,7 +1,6 @@ package policy import ( - "encoding/json" "errors" "fmt" "io" @@ -11,6 +10,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/yl2chen/cidranger" + "k8s.io/apimachinery/pkg/util/yaml" "github.com/TecharoHQ/anubis/lib/policy/config" ) @@ -38,8 +38,8 @@ func NewParsedConfig(orig config.Config) *ParsedConfig { func ParseConfig(fin io.Reader, fname string, defaultDifficulty int) (*ParsedConfig, error) { var c config.Config - if err := json.NewDecoder(fin).Decode(&c); err != nil { - return nil, fmt.Errorf("can't parse policy config JSON %s: %w", fname, err) + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { + return nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err) } if err := c.Valid(); err != nil {