diff --git a/cmd/anubis/main.go b/cmd/anubis/main.go index b7375ea..f47acec 100644 --- a/cmd/anubis/main.go +++ b/cmd/anubis/main.go @@ -27,6 +27,7 @@ import ( "time" "github.com/TecharoHQ/anubis" + "github.com/TecharoHQ/anubis/data" "github.com/TecharoHQ/anubis/internal" libanubis "github.com/TecharoHQ/anubis/lib" botPolicy "github.com/TecharoHQ/anubis/lib/policy" @@ -184,6 +185,9 @@ func main() { } if *extractResources != "" { + if err := extractEmbedFS(data.BotPolicies, ".", *extractResources); err != nil { + log.Fatal(err) + } if err := extractEmbedFS(web.Static, "static", *extractResources); err != nil { log.Fatal(err) } @@ -347,7 +351,7 @@ func extractEmbedFS(fsys embed.FS, root string, destDir string) error { return err } - destPath := filepath.Join(destDir, relPath) + destPath := filepath.Join(destDir, root, relPath) if d.IsDir() { return os.MkdirAll(destPath, 0o700) diff --git a/data/apps/gitea-rss-feeds.yaml b/data/apps/gitea-rss-feeds.yaml new file mode 100644 index 0000000..7bd34ce --- /dev/null +++ b/data/apps/gitea-rss-feeds.yaml @@ -0,0 +1,7 @@ +# By Aibrew: https://github.com/TecharoHQ/anubis/discussions/261#discussioncomment-12821065 +- name: gitea-feed-atom + action: ALLOW + path_regex: ^/[.A-Za-z0-9_-]{1,256}?[./A-Za-z0-9_-]*\.atom$ +- name: gitea-feed-rss + action: ALLOW + path_regex: ^/[.A-Za-z0-9_-]{1,256}?[./A-Za-z0-9_-]*\.rss$ \ No newline at end of file diff --git a/data/botPolicies.json b/data/botPolicies.json index 72d38dc..dad04e8 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -1,678 +1,47 @@ { "bots": [ { - "name": "cloudflare-workers", - "headers_regex": { - "CF-Worker": ".*" - }, - "action": "DENY" + "import": "(data)/bots/ai-robots-txt.yaml" }, { - "name": "ai-robots-txt", - "user_agent_regex": "AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 
1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot", - "action": "DENY" + "import": "(data)/bots/cloudflare-workers.yaml" }, { - "name": "googlebot", - "user_agent_regex": "\\+http\\://www\\.google\\.com/bot\\.html", - "action": "ALLOW", - "remote_addresses": [ - "2001:4860:4801:10::/64", - "2001:4860:4801:11::/64", - "2001:4860:4801:12::/64", - "2001:4860:4801:13::/64", - "2001:4860:4801:14::/64", - "2001:4860:4801:15::/64", - "2001:4860:4801:16::/64", - "2001:4860:4801:17::/64", - "2001:4860:4801:18::/64", - "2001:4860:4801:19::/64", - "2001:4860:4801:1a::/64", - "2001:4860:4801:1b::/64", - "2001:4860:4801:1c::/64", - "2001:4860:4801:1d::/64", - "2001:4860:4801:1e::/64", - "2001:4860:4801:1f::/64", - "2001:4860:4801:20::/64", - "2001:4860:4801:21::/64", - "2001:4860:4801:22::/64", - "2001:4860:4801:23::/64", - "2001:4860:4801:24::/64", - "2001:4860:4801:25::/64", - "2001:4860:4801:26::/64", - "2001:4860:4801:27::/64", - "2001:4860:4801:28::/64", - "2001:4860:4801:29::/64", - "2001:4860:4801:2::/64", - "2001:4860:4801:2a::/64", - "2001:4860:4801:2b::/64", - "2001:4860:4801:2c::/64", - "2001:4860:4801:2d::/64", - "2001:4860:4801:2e::/64", - "2001:4860:4801:2f::/64", - "2001:4860:4801:31::/64", - "2001:4860:4801:32::/64", - "2001:4860:4801:33::/64", - "2001:4860:4801:34::/64", - "2001:4860:4801:35::/64", - "2001:4860:4801:36::/64", - "2001:4860:4801:37::/64", - "2001:4860:4801:38::/64", - "2001:4860:4801:39::/64", - "2001:4860:4801:3a::/64", - "2001:4860:4801:3b::/64", - 
"2001:4860:4801:3c::/64", - "2001:4860:4801:3d::/64", - "2001:4860:4801:3e::/64", - "2001:4860:4801:40::/64", - "2001:4860:4801:41::/64", - "2001:4860:4801:42::/64", - "2001:4860:4801:43::/64", - "2001:4860:4801:44::/64", - "2001:4860:4801:45::/64", - "2001:4860:4801:46::/64", - "2001:4860:4801:47::/64", - "2001:4860:4801:48::/64", - "2001:4860:4801:49::/64", - "2001:4860:4801:4a::/64", - "2001:4860:4801:4b::/64", - "2001:4860:4801:4c::/64", - "2001:4860:4801:50::/64", - "2001:4860:4801:51::/64", - "2001:4860:4801:52::/64", - "2001:4860:4801:53::/64", - "2001:4860:4801:54::/64", - "2001:4860:4801:55::/64", - "2001:4860:4801:56::/64", - "2001:4860:4801:60::/64", - "2001:4860:4801:61::/64", - "2001:4860:4801:62::/64", - "2001:4860:4801:63::/64", - "2001:4860:4801:64::/64", - "2001:4860:4801:65::/64", - "2001:4860:4801:66::/64", - "2001:4860:4801:67::/64", - "2001:4860:4801:68::/64", - "2001:4860:4801:69::/64", - "2001:4860:4801:6a::/64", - "2001:4860:4801:6b::/64", - "2001:4860:4801:6c::/64", - "2001:4860:4801:6d::/64", - "2001:4860:4801:6e::/64", - "2001:4860:4801:6f::/64", - "2001:4860:4801:70::/64", - "2001:4860:4801:71::/64", - "2001:4860:4801:72::/64", - "2001:4860:4801:73::/64", - "2001:4860:4801:74::/64", - "2001:4860:4801:75::/64", - "2001:4860:4801:76::/64", - "2001:4860:4801:77::/64", - "2001:4860:4801:78::/64", - "2001:4860:4801:79::/64", - "2001:4860:4801:80::/64", - "2001:4860:4801:81::/64", - "2001:4860:4801:82::/64", - "2001:4860:4801:83::/64", - "2001:4860:4801:84::/64", - "2001:4860:4801:85::/64", - "2001:4860:4801:86::/64", - "2001:4860:4801:87::/64", - "2001:4860:4801:88::/64", - "2001:4860:4801:90::/64", - "2001:4860:4801:91::/64", - "2001:4860:4801:92::/64", - "2001:4860:4801:93::/64", - "2001:4860:4801:94::/64", - "2001:4860:4801:95::/64", - "2001:4860:4801:96::/64", - "2001:4860:4801:a0::/64", - "2001:4860:4801:a1::/64", - "2001:4860:4801:a2::/64", - "2001:4860:4801:a3::/64", - "2001:4860:4801:a4::/64", - "2001:4860:4801:a5::/64", - 
"2001:4860:4801:c::/64", - "2001:4860:4801:f::/64", - "192.178.5.0/27", - "192.178.6.0/27", - "192.178.6.128/27", - "192.178.6.160/27", - "192.178.6.192/27", - "192.178.6.32/27", - "192.178.6.64/27", - "192.178.6.96/27", - "34.100.182.96/28", - "34.101.50.144/28", - "34.118.254.0/28", - "34.118.66.0/28", - "34.126.178.96/28", - "34.146.150.144/28", - "34.147.110.144/28", - "34.151.74.144/28", - "34.152.50.64/28", - "34.154.114.144/28", - "34.155.98.32/28", - "34.165.18.176/28", - "34.175.160.64/28", - "34.176.130.16/28", - "34.22.85.0/27", - "34.64.82.64/28", - "34.65.242.112/28", - "34.80.50.80/28", - "34.88.194.0/28", - "34.89.10.80/28", - "34.89.198.80/28", - "34.96.162.48/28", - "35.247.243.240/28", - "66.249.64.0/27", - "66.249.64.128/27", - "66.249.64.160/27", - "66.249.64.224/27", - "66.249.64.32/27", - "66.249.64.64/27", - "66.249.64.96/27", - "66.249.65.0/27", - "66.249.65.128/27", - "66.249.65.160/27", - "66.249.65.192/27", - "66.249.65.224/27", - "66.249.65.32/27", - "66.249.65.64/27", - "66.249.65.96/27", - "66.249.66.0/27", - "66.249.66.128/27", - "66.249.66.160/27", - "66.249.66.192/27", - "66.249.66.224/27", - "66.249.66.32/27", - "66.249.66.64/27", - "66.249.66.96/27", - "66.249.68.0/27", - "66.249.68.128/27", - "66.249.68.32/27", - "66.249.68.64/27", - "66.249.68.96/27", - "66.249.69.0/27", - "66.249.69.128/27", - "66.249.69.160/27", - "66.249.69.192/27", - "66.249.69.224/27", - "66.249.69.32/27", - "66.249.69.64/27", - "66.249.69.96/27", - "66.249.70.0/27", - "66.249.70.128/27", - "66.249.70.160/27", - "66.249.70.192/27", - "66.249.70.224/27", - "66.249.70.32/27", - "66.249.70.64/27", - "66.249.70.96/27", - "66.249.71.0/27", - "66.249.71.128/27", - "66.249.71.160/27", - "66.249.71.192/27", - "66.249.71.224/27", - "66.249.71.32/27", - "66.249.71.64/27", - "66.249.71.96/27", - "66.249.72.0/27", - "66.249.72.128/27", - "66.249.72.160/27", - "66.249.72.192/27", - "66.249.72.224/27", - "66.249.72.32/27", - "66.249.72.64/27", - "66.249.72.96/27", - 
"66.249.73.0/27", - "66.249.73.128/27", - "66.249.73.160/27", - "66.249.73.192/27", - "66.249.73.224/27", - "66.249.73.32/27", - "66.249.73.64/27", - "66.249.73.96/27", - "66.249.74.0/27", - "66.249.74.128/27", - "66.249.74.160/27", - "66.249.74.192/27", - "66.249.74.32/27", - "66.249.74.64/27", - "66.249.74.96/27", - "66.249.75.0/27", - "66.249.75.128/27", - "66.249.75.160/27", - "66.249.75.192/27", - "66.249.75.224/27", - "66.249.75.32/27", - "66.249.75.64/27", - "66.249.75.96/27", - "66.249.76.0/27", - "66.249.76.128/27", - "66.249.76.160/27", - "66.249.76.192/27", - "66.249.76.224/27", - "66.249.76.32/27", - "66.249.76.64/27", - "66.249.76.96/27", - "66.249.77.0/27", - "66.249.77.128/27", - "66.249.77.160/27", - "66.249.77.192/27", - "66.249.77.224/27", - "66.249.77.32/27", - "66.249.77.64/27", - "66.249.77.96/27", - "66.249.78.0/27", - "66.249.78.32/27", - "66.249.79.0/27", - "66.249.79.128/27", - "66.249.79.160/27", - "66.249.79.192/27", - "66.249.79.224/27", - "66.249.79.32/27", - "66.249.79.64/27", - "66.249.79.96/27" - ] + "import": "(data)/bots/headless-browsers.yaml" }, { - "name": "bingbot", - "user_agent_regex": "\\+http\\://www\\.bing\\.com/bingbot\\.htm", - "action": "ALLOW", - "remote_addresses": [ - "157.55.39.0/24", - "207.46.13.0/24", - "40.77.167.0/24", - "13.66.139.0/24", - "13.66.144.0/24", - "52.167.144.0/24", - "13.67.10.16/28", - "13.69.66.240/28", - "13.71.172.224/28", - "139.217.52.0/28", - "191.233.204.224/28", - "20.36.108.32/28", - "20.43.120.16/28", - "40.79.131.208/28", - "40.79.186.176/28", - "52.231.148.0/28", - "20.79.107.240/28", - "51.105.67.0/28", - "20.125.163.80/28", - "40.77.188.0/22", - "65.55.210.0/24", - "199.30.24.0/23", - "40.77.202.0/24", - "40.77.139.0/25", - "20.74.197.0/28", - "20.15.133.160/27", - "40.77.177.0/24", - "40.77.178.0/23" - ] + "import": "(data)/bots/us-ai-scraper.yaml" }, { - "name": "duckduckbot", - "user_agent_regex": "\\+http\\://duckduckgo\\.com/duckduckbot\\.html", - "action": "ALLOW", - 
"remote_addresses": [ - "57.152.72.128/32", - "51.8.253.152/32", - "40.80.242.63/32", - "20.12.141.99/32", - "20.49.136.28/32", - "51.116.131.221/32", - "51.107.40.209/32", - "20.40.133.240/32", - "20.50.168.91/32", - "51.120.48.122/32", - "20.193.45.113/32", - "40.76.173.151/32", - "40.76.163.7/32", - "20.185.79.47/32", - "52.142.26.175/32", - "20.185.79.15/32", - "52.142.24.149/32", - "40.76.162.208/32", - "40.76.163.23/32", - "40.76.162.191/32", - "40.76.162.247/32", - "40.88.21.235/32", - "20.191.45.212/32", - "52.146.59.12/32", - "52.146.59.156/32", - "52.146.59.154/32", - "52.146.58.236/32", - "20.62.224.44/32", - "51.104.180.53/32", - "51.104.180.47/32", - "51.104.180.26/32", - "51.104.146.225/32", - "51.104.146.235/32", - "20.73.202.147/32", - "20.73.132.240/32", - "20.71.12.143/32", - "20.56.197.58/32", - "20.56.197.63/32", - "20.43.150.93/32", - "20.43.150.85/32", - "20.44.222.1/32", - "40.89.243.175/32", - "13.89.106.77/32", - "52.143.242.6/32", - "52.143.241.111/32", - "52.154.60.82/32", - "20.197.209.11/32", - "20.197.209.27/32", - "20.226.133.105/32", - "191.234.216.4/32", - "191.234.216.178/32", - "20.53.92.211/32", - "20.53.91.2/32", - "20.207.99.197/32", - "20.207.97.190/32", - "40.81.250.205/32", - "40.64.106.11/32", - "40.64.105.247/32", - "20.72.242.93/32", - "20.99.255.235/32", - "20.113.3.121/32", - "52.224.16.221/32", - "52.224.21.53/32", - "52.224.20.204/32", - "52.224.21.19/32", - "52.224.20.249/32", - "52.224.20.203/32", - "52.224.20.190/32", - "52.224.16.229/32", - "52.224.21.20/32", - "52.146.63.80/32", - "52.224.20.227/32", - "52.224.20.193/32", - "52.190.37.160/32", - "52.224.21.23/32", - "52.224.20.223/32", - "52.224.20.181/32", - "52.224.21.49/32", - "52.224.21.55/32", - "52.224.21.61/32", - "52.224.19.152/32", - "52.224.20.186/32", - "52.224.21.27/32", - "52.224.21.51/32", - "52.224.20.174/32", - "52.224.21.4/32", - "51.104.164.109/32", - "51.104.167.71/32", - "51.104.160.177/32", - "51.104.162.149/32", - "51.104.167.95/32", - 
"51.104.167.54/32", - "51.104.166.111/32", - "51.104.167.88/32", - "51.104.161.32/32", - "51.104.163.250/32", - "51.104.164.189/32", - "51.104.167.19/32", - "51.104.160.167/32", - "51.104.167.110/32", - "20.191.44.119/32", - "51.104.167.104/32", - "20.191.44.234/32", - "51.104.164.215/32", - "51.104.167.52/32", - "20.191.44.22/32", - "51.104.167.87/32", - "51.104.167.96/32", - "20.191.44.16/32", - "51.104.167.61/32", - "51.104.164.147/32", - "20.50.48.159/32", - "40.114.182.172/32", - "20.50.50.130/32", - "20.50.50.163/32", - "20.50.50.46/32", - "40.114.182.153/32", - "20.50.50.118/32", - "20.50.49.55/32", - "20.50.49.25/32", - "40.114.183.251/32", - "20.50.50.123/32", - "20.50.49.237/32", - "20.50.48.192/32", - "20.50.50.134/32", - "51.138.90.233/32", - "40.114.183.196/32", - "20.50.50.146/32", - "40.114.183.88/32", - "20.50.50.145/32", - "20.50.50.121/32", - "20.50.49.40/32", - "51.138.90.206/32", - "40.114.182.45/32", - "51.138.90.161/32", - "20.50.49.0/32", - "40.119.232.215/32", - "104.43.55.167/32", - "40.119.232.251/32", - "40.119.232.50/32", - "40.119.232.146/32", - "40.119.232.218/32", - "104.43.54.127/32", - "104.43.55.117/32", - "104.43.55.116/32", - "104.43.55.166/32", - "52.154.169.50/32", - "52.154.171.70/32", - "52.154.170.229/32", - "52.154.170.113/32", - "52.154.171.44/32", - "52.154.172.2/32", - "52.143.244.81/32", - "52.154.171.87/32", - "52.154.171.250/32", - "52.154.170.28/32", - "52.154.170.122/32", - "52.143.243.117/32", - "52.143.247.235/32", - "52.154.171.235/32", - "52.154.171.196/32", - "52.154.171.0/32", - "52.154.170.243/32", - "52.154.170.26/32", - "52.154.169.200/32", - "52.154.170.96/32", - "52.154.170.88/32", - "52.154.171.150/32", - "52.154.171.205/32", - "52.154.170.117/32", - "52.154.170.209/32", - "191.235.202.48/32", - "191.233.3.202/32", - "191.235.201.214/32", - "191.233.3.197/32", - "191.235.202.38/32", - "20.53.78.144/32", - "20.193.24.10/32", - "20.53.78.236/32", - "20.53.78.138/32", - "20.53.78.123/32", - 
"20.53.78.106/32", - "20.193.27.215/32", - "20.193.25.197/32", - "20.193.12.126/32", - "20.193.24.251/32", - "20.204.242.101/32", - "20.207.72.113/32", - "20.204.242.19/32", - "20.219.45.67/32", - "20.207.72.11/32", - "20.219.45.190/32", - "20.204.243.55/32", - "20.204.241.148/32", - "20.207.72.110/32", - "20.204.240.172/32", - "20.207.72.21/32", - "20.204.246.81/32", - "20.207.107.181/32", - "20.204.246.254/32", - "20.219.43.246/32", - "52.149.25.43/32", - "52.149.61.51/32", - "52.149.58.139/32", - "52.149.60.38/32", - "52.148.165.38/32", - "52.143.95.162/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32" - ] + "import": "(data)/crawlers/googlebot.yaml" }, { - "name": "qwantbot", - "user_agent_regex": 
"\\+https\\://help\\.qwant\\.com/bot/", - "action": "ALLOW", - "remote_addresses": [ - "91.242.162.0/24" - ] + "import": "(data)/crawlers/bingbot.yaml" }, { - "name": "internet-archive", - "action": "ALLOW", - "remote_addresses": [ - "207.241.224.0/20", - "208.70.24.0/21", - "2620:0:9c0::/48" - ] + "import": "(data)/crawlers/duckduckbot.yaml" }, { - "name": "kagibot", - "user_agent_regex": "\\+https\\://kagi\\.com/bot", - "action": "ALLOW", - "remote_addresses": [ - "216.18.205.234/32", - "35.212.27.76/32", - "104.254.65.50/32", - "209.151.156.194/32" - ] + "import": "(data)/crawlers/qwantbot.yaml" }, { - "name": "marginalia", - "user_agent_regex": "search\\.marginalia\\.nu", - "action": "ALLOW", - "remote_addresses": [ - "193.183.0.162/31", - "193.183.0.164/30", - "193.183.0.168/30", - "193.183.0.172/31", - "193.183.0.174/32" - ] + "import": "(data)/crawlers/internet-archive.yaml" }, { - "name": "mojeekbot", - "user_agent_regex": "http\\://www\\.mojeek\\.com/bot\\.html", - "action": "ALLOW", - "remote_addresses": [ - "5.102.173.71/32" - ] + "import": "(data)/crawlers/kagibot.yaml" }, { - "name": "us-artificial-intelligence-scraper", - "user_agent_regex": "\\+https\\://github\\.com/US-Artificial-Intelligence/scraper", - "action": "DENY" + "import": "(data)/crawlers/marginalia.yaml" }, { - "name": "well-known", - "path_regex": "^/.well-known/.*$", - "action": "ALLOW" + "import": "(data)/crawlers/mojeekbot.yaml" }, { - "name": "favicon", - "path_regex": "^/favicon.ico$", - "action": "ALLOW" - }, - { - "name": "robots-txt", - "path_regex": "^/robots.txt$", - "action": "ALLOW" - }, - { - "name": "lightpanda", - "user_agent_regex": "^Lightpanda/.*$", - "action": "DENY" - }, - { - "name": "headless-chrome", - "user_agent_regex": "HeadlessChrome", - "action": "DENY" - }, - { - "name": "headless-chromium", - "user_agent_regex": "HeadlessChromium", - "action": "DENY" + "import": "(data)/common/keep-internet-working.yaml" }, { "name": "generic-browser", - "user_agent_regex": 
"Mozilla|Opera", + "user_agent_regex": "Mozilla|Opera\n", "action": "CHALLENGE" } ], diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index cb4715a..585be15 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -1,651 +1,38 @@ +## Anubis has the ability to let you import snippets of configuration into the main +## configuration file. This allows you to break up your config into smaller parts +## that get logically assembled into one big file. +## +## Of note, a bot rule can either have inline bot configuration or import a +## bot config snippet. You cannot do both in a single bot rule. +## +## Import paths can either be prefixed with (data) to import from the common/shared +## rules in the data folder in the Anubis source tree or will point to absolute/relative +## paths in your filesystem. If you don't have access to the Anubis source tree, check +## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from. + bots: # Pathological bots to deny -- name: us-artificial-intelligence-scraper - user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper - action: DENY -- name: lightpanda - user_agent_regex: ^LightPanda/.*$ - action: DENY -- name: headless-chrome - user_agent_regex: HeadlessChrome - action: DENY -- name: headless-chromium - user_agent_regex: HeadlessChromium - action: DENY -- name: "ai-robots-txt" - user_agent_regex: > - AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer 
bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot - action: DENY -- name: cloudflare-workers - headers_regex: - CF-Worker: .* - action: DENY +- # This correlates to data/bots/ai-robots-txt.yaml in the source tree + import: (data)/bots/ai-robots-txt.yaml +- import: (data)/bots/cloudflare-workers.yaml +- import: (data)/bots/headless-browsers.yaml +- import: (data)/bots/us-ai-scraper.yaml -# search engines to allow -- name: googlebot - user_agent_regex: \+http\://www\.google\.com/bot\.html - action: ALLOW - # https://developers.google.com/static/search/apis/ipranges/googlebot.json - remote_addresses: [ - "2001:4860:4801:10::/64", - "2001:4860:4801:11::/64", - "2001:4860:4801:12::/64", - "2001:4860:4801:13::/64", - "2001:4860:4801:14::/64", - "2001:4860:4801:15::/64", - "2001:4860:4801:16::/64", - "2001:4860:4801:17::/64", - "2001:4860:4801:18::/64", - "2001:4860:4801:19::/64", - "2001:4860:4801:1a::/64", - "2001:4860:4801:1b::/64", - "2001:4860:4801:1c::/64", - "2001:4860:4801:1d::/64", - "2001:4860:4801:1e::/64", - "2001:4860:4801:1f::/64", - "2001:4860:4801:20::/64", - "2001:4860:4801:21::/64", - "2001:4860:4801:22::/64", - "2001:4860:4801:23::/64", - "2001:4860:4801:24::/64", - "2001:4860:4801:25::/64", - "2001:4860:4801:26::/64", - "2001:4860:4801:27::/64", - "2001:4860:4801:28::/64", - "2001:4860:4801:29::/64", - "2001:4860:4801:2::/64", - "2001:4860:4801:2a::/64", - "2001:4860:4801:2b::/64", - "2001:4860:4801:2c::/64", - "2001:4860:4801:2d::/64", - "2001:4860:4801:2e::/64", - "2001:4860:4801:2f::/64", - "2001:4860:4801:31::/64", - "2001:4860:4801:32::/64", - "2001:4860:4801:33::/64", - "2001:4860:4801:34::/64", - "2001:4860:4801:35::/64", - "2001:4860:4801:36::/64", - "2001:4860:4801:37::/64", - "2001:4860:4801:38::/64", - "2001:4860:4801:39::/64", - "2001:4860:4801:3a::/64", - "2001:4860:4801:3b::/64", - "2001:4860:4801:3c::/64", - "2001:4860:4801:3d::/64", - "2001:4860:4801:3e::/64", - "2001:4860:4801:40::/64", - "2001:4860:4801:41::/64", - 
"2001:4860:4801:42::/64", - "2001:4860:4801:43::/64", - "2001:4860:4801:44::/64", - "2001:4860:4801:45::/64", - "2001:4860:4801:46::/64", - "2001:4860:4801:47::/64", - "2001:4860:4801:48::/64", - "2001:4860:4801:49::/64", - "2001:4860:4801:4a::/64", - "2001:4860:4801:4b::/64", - "2001:4860:4801:4c::/64", - "2001:4860:4801:50::/64", - "2001:4860:4801:51::/64", - "2001:4860:4801:52::/64", - "2001:4860:4801:53::/64", - "2001:4860:4801:54::/64", - "2001:4860:4801:55::/64", - "2001:4860:4801:56::/64", - "2001:4860:4801:60::/64", - "2001:4860:4801:61::/64", - "2001:4860:4801:62::/64", - "2001:4860:4801:63::/64", - "2001:4860:4801:64::/64", - "2001:4860:4801:65::/64", - "2001:4860:4801:66::/64", - "2001:4860:4801:67::/64", - "2001:4860:4801:68::/64", - "2001:4860:4801:69::/64", - "2001:4860:4801:6a::/64", - "2001:4860:4801:6b::/64", - "2001:4860:4801:6c::/64", - "2001:4860:4801:6d::/64", - "2001:4860:4801:6e::/64", - "2001:4860:4801:6f::/64", - "2001:4860:4801:70::/64", - "2001:4860:4801:71::/64", - "2001:4860:4801:72::/64", - "2001:4860:4801:73::/64", - "2001:4860:4801:74::/64", - "2001:4860:4801:75::/64", - "2001:4860:4801:76::/64", - "2001:4860:4801:77::/64", - "2001:4860:4801:78::/64", - "2001:4860:4801:79::/64", - "2001:4860:4801:80::/64", - "2001:4860:4801:81::/64", - "2001:4860:4801:82::/64", - "2001:4860:4801:83::/64", - "2001:4860:4801:84::/64", - "2001:4860:4801:85::/64", - "2001:4860:4801:86::/64", - "2001:4860:4801:87::/64", - "2001:4860:4801:88::/64", - "2001:4860:4801:90::/64", - "2001:4860:4801:91::/64", - "2001:4860:4801:92::/64", - "2001:4860:4801:93::/64", - "2001:4860:4801:94::/64", - "2001:4860:4801:95::/64", - "2001:4860:4801:96::/64", - "2001:4860:4801:a0::/64", - "2001:4860:4801:a1::/64", - "2001:4860:4801:a2::/64", - "2001:4860:4801:a3::/64", - "2001:4860:4801:a4::/64", - "2001:4860:4801:a5::/64", - "2001:4860:4801:c::/64", - "2001:4860:4801:f::/64", - "192.178.5.0/27", - "192.178.6.0/27", - "192.178.6.128/27", - "192.178.6.160/27", - 
"192.178.6.192/27", - "192.178.6.32/27", - "192.178.6.64/27", - "192.178.6.96/27", - "34.100.182.96/28", - "34.101.50.144/28", - "34.118.254.0/28", - "34.118.66.0/28", - "34.126.178.96/28", - "34.146.150.144/28", - "34.147.110.144/28", - "34.151.74.144/28", - "34.152.50.64/28", - "34.154.114.144/28", - "34.155.98.32/28", - "34.165.18.176/28", - "34.175.160.64/28", - "34.176.130.16/28", - "34.22.85.0/27", - "34.64.82.64/28", - "34.65.242.112/28", - "34.80.50.80/28", - "34.88.194.0/28", - "34.89.10.80/28", - "34.89.198.80/28", - "34.96.162.48/28", - "35.247.243.240/28", - "66.249.64.0/27", - "66.249.64.128/27", - "66.249.64.160/27", - "66.249.64.224/27", - "66.249.64.32/27", - "66.249.64.64/27", - "66.249.64.96/27", - "66.249.65.0/27", - "66.249.65.128/27", - "66.249.65.160/27", - "66.249.65.192/27", - "66.249.65.224/27", - "66.249.65.32/27", - "66.249.65.64/27", - "66.249.65.96/27", - "66.249.66.0/27", - "66.249.66.128/27", - "66.249.66.160/27", - "66.249.66.192/27", - "66.249.66.224/27", - "66.249.66.32/27", - "66.249.66.64/27", - "66.249.66.96/27", - "66.249.68.0/27", - "66.249.68.128/27", - "66.249.68.32/27", - "66.249.68.64/27", - "66.249.68.96/27", - "66.249.69.0/27", - "66.249.69.128/27", - "66.249.69.160/27", - "66.249.69.192/27", - "66.249.69.224/27", - "66.249.69.32/27", - "66.249.69.64/27", - "66.249.69.96/27", - "66.249.70.0/27", - "66.249.70.128/27", - "66.249.70.160/27", - "66.249.70.192/27", - "66.249.70.224/27", - "66.249.70.32/27", - "66.249.70.64/27", - "66.249.70.96/27", - "66.249.71.0/27", - "66.249.71.128/27", - "66.249.71.160/27", - "66.249.71.192/27", - "66.249.71.224/27", - "66.249.71.32/27", - "66.249.71.64/27", - "66.249.71.96/27", - "66.249.72.0/27", - "66.249.72.128/27", - "66.249.72.160/27", - "66.249.72.192/27", - "66.249.72.224/27", - "66.249.72.32/27", - "66.249.72.64/27", - "66.249.72.96/27", - "66.249.73.0/27", - "66.249.73.128/27", - "66.249.73.160/27", - "66.249.73.192/27", - "66.249.73.224/27", - "66.249.73.32/27", - 
"66.249.73.64/27", - "66.249.73.96/27", - "66.249.74.0/27", - "66.249.74.128/27", - "66.249.74.160/27", - "66.249.74.192/27", - "66.249.74.32/27", - "66.249.74.64/27", - "66.249.74.96/27", - "66.249.75.0/27", - "66.249.75.128/27", - "66.249.75.160/27", - "66.249.75.192/27", - "66.249.75.224/27", - "66.249.75.32/27", - "66.249.75.64/27", - "66.249.75.96/27", - "66.249.76.0/27", - "66.249.76.128/27", - "66.249.76.160/27", - "66.249.76.192/27", - "66.249.76.224/27", - "66.249.76.32/27", - "66.249.76.64/27", - "66.249.76.96/27", - "66.249.77.0/27", - "66.249.77.128/27", - "66.249.77.160/27", - "66.249.77.192/27", - "66.249.77.224/27", - "66.249.77.32/27", - "66.249.77.64/27", - "66.249.77.96/27", - "66.249.78.0/27", - "66.249.78.32/27", - "66.249.79.0/27", - "66.249.79.128/27", - "66.249.79.160/27", - "66.249.79.192/27", - "66.249.79.224/27", - "66.249.79.32/27", - "66.249.79.64/27", - "66.249.79.96/27" - ] -- name: bingbot - user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm - action: ALLOW - # https://www.bing.com/toolbox/bingbot.json - remote_addresses: [ - "157.55.39.0/24", - "207.46.13.0/24", - "40.77.167.0/24", - "13.66.139.0/24", - "13.66.144.0/24", - "52.167.144.0/24", - "13.67.10.16/28", - "13.69.66.240/28", - "13.71.172.224/28", - "139.217.52.0/28", - "191.233.204.224/28", - "20.36.108.32/28", - "20.43.120.16/28", - "40.79.131.208/28", - "40.79.186.176/28", - "52.231.148.0/28", - "20.79.107.240/28", - "51.105.67.0/28", - "20.125.163.80/28", - "40.77.188.0/22", - "65.55.210.0/24", - "199.30.24.0/23", - "40.77.202.0/24", - "40.77.139.0/25", - "20.74.197.0/28", - "20.15.133.160/27", - "40.77.177.0/24", - "40.77.178.0/23" - ] -- name: duckduckbot - user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) - action: ALLOW - # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot - remote_addresses: [ - "57.152.72.128/32", - "51.8.253.152/32", - "40.80.242.63/32", - "20.12.141.99/32", - "20.49.136.28/32", - 
"51.116.131.221/32", - "51.107.40.209/32", - "20.40.133.240/32", - "20.50.168.91/32", - "51.120.48.122/32", - "20.193.45.113/32", - "40.76.173.151/32", - "40.76.163.7/32", - "20.185.79.47/32", - "52.142.26.175/32", - "20.185.79.15/32", - "52.142.24.149/32", - "40.76.162.208/32", - "40.76.163.23/32", - "40.76.162.191/32", - "40.76.162.247/32", - "40.88.21.235/32", - "20.191.45.212/32", - "52.146.59.12/32", - "52.146.59.156/32", - "52.146.59.154/32", - "52.146.58.236/32", - "20.62.224.44/32", - "51.104.180.53/32", - "51.104.180.47/32", - "51.104.180.26/32", - "51.104.146.225/32", - "51.104.146.235/32", - "20.73.202.147/32", - "20.73.132.240/32", - "20.71.12.143/32", - "20.56.197.58/32", - "20.56.197.63/32", - "20.43.150.93/32", - "20.43.150.85/32", - "20.44.222.1/32", - "40.89.243.175/32", - "13.89.106.77/32", - "52.143.242.6/32", - "52.143.241.111/32", - "52.154.60.82/32", - "20.197.209.11/32", - "20.197.209.27/32", - "20.226.133.105/32", - "191.234.216.4/32", - "191.234.216.178/32", - "20.53.92.211/32", - "20.53.91.2/32", - "20.207.99.197/32", - "20.207.97.190/32", - "40.81.250.205/32", - "40.64.106.11/32", - "40.64.105.247/32", - "20.72.242.93/32", - "20.99.255.235/32", - "20.113.3.121/32", - "52.224.16.221/32", - "52.224.21.53/32", - "52.224.20.204/32", - "52.224.21.19/32", - "52.224.20.249/32", - "52.224.20.203/32", - "52.224.20.190/32", - "52.224.16.229/32", - "52.224.21.20/32", - "52.146.63.80/32", - "52.224.20.227/32", - "52.224.20.193/32", - "52.190.37.160/32", - "52.224.21.23/32", - "52.224.20.223/32", - "52.224.20.181/32", - "52.224.21.49/32", - "52.224.21.55/32", - "52.224.21.61/32", - "52.224.19.152/32", - "52.224.20.186/32", - "52.224.21.27/32", - "52.224.21.51/32", - "52.224.20.174/32", - "52.224.21.4/32", - "51.104.164.109/32", - "51.104.167.71/32", - "51.104.160.177/32", - "51.104.162.149/32", - "51.104.167.95/32", - "51.104.167.54/32", - "51.104.166.111/32", - "51.104.167.88/32", - "51.104.161.32/32", - "51.104.163.250/32", - "51.104.164.189/32", - 
"51.104.167.19/32", - "51.104.160.167/32", - "51.104.167.110/32", - "20.191.44.119/32", - "51.104.167.104/32", - "20.191.44.234/32", - "51.104.164.215/32", - "51.104.167.52/32", - "20.191.44.22/32", - "51.104.167.87/32", - "51.104.167.96/32", - "20.191.44.16/32", - "51.104.167.61/32", - "51.104.164.147/32", - "20.50.48.159/32", - "40.114.182.172/32", - "20.50.50.130/32", - "20.50.50.163/32", - "20.50.50.46/32", - "40.114.182.153/32", - "20.50.50.118/32", - "20.50.49.55/32", - "20.50.49.25/32", - "40.114.183.251/32", - "20.50.50.123/32", - "20.50.49.237/32", - "20.50.48.192/32", - "20.50.50.134/32", - "51.138.90.233/32", - "40.114.183.196/32", - "20.50.50.146/32", - "40.114.183.88/32", - "20.50.50.145/32", - "20.50.50.121/32", - "20.50.49.40/32", - "51.138.90.206/32", - "40.114.182.45/32", - "51.138.90.161/32", - "20.50.49.0/32", - "40.119.232.215/32", - "104.43.55.167/32", - "40.119.232.251/32", - "40.119.232.50/32", - "40.119.232.146/32", - "40.119.232.218/32", - "104.43.54.127/32", - "104.43.55.117/32", - "104.43.55.116/32", - "104.43.55.166/32", - "52.154.169.50/32", - "52.154.171.70/32", - "52.154.170.229/32", - "52.154.170.113/32", - "52.154.171.44/32", - "52.154.172.2/32", - "52.143.244.81/32", - "52.154.171.87/32", - "52.154.171.250/32", - "52.154.170.28/32", - "52.154.170.122/32", - "52.143.243.117/32", - "52.143.247.235/32", - "52.154.171.235/32", - "52.154.171.196/32", - "52.154.171.0/32", - "52.154.170.243/32", - "52.154.170.26/32", - "52.154.169.200/32", - "52.154.170.96/32", - "52.154.170.88/32", - "52.154.171.150/32", - "52.154.171.205/32", - "52.154.170.117/32", - "52.154.170.209/32", - "191.235.202.48/32", - "191.233.3.202/32", - "191.235.201.214/32", - "191.233.3.197/32", - "191.235.202.38/32", - "20.53.78.144/32", - "20.193.24.10/32", - "20.53.78.236/32", - "20.53.78.138/32", - "20.53.78.123/32", - "20.53.78.106/32", - "20.193.27.215/32", - "20.193.25.197/32", - "20.193.12.126/32", - "20.193.24.251/32", - "20.204.242.101/32", - "20.207.72.113/32", 
- "20.204.242.19/32", - "20.219.45.67/32", - "20.207.72.11/32", - "20.219.45.190/32", - "20.204.243.55/32", - "20.204.241.148/32", - "20.207.72.110/32", - "20.204.240.172/32", - "20.207.72.21/32", - "20.204.246.81/32", - "20.207.107.181/32", - "20.204.246.254/32", - "20.219.43.246/32", - "52.149.25.43/32", - "52.149.61.51/32", - "52.149.58.139/32", - "52.149.60.38/32", - "52.148.165.38/32", - "52.143.95.162/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32" - ] -- name: qwantbot - user_agent_regex: \+https\://help\.qwant\.com/bot/ - action: ALLOW - # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json - remote_addresses: [ "91.242.162.0/24" ] -- name: internet-archive - action: ALLOW - # 
https://ipinfo.io/AS7941 - remote_addresses: [ - "207.241.224.0/20", - "208.70.24.0/21", - "2620:0:9c0::/48" - ] -- name: kagibot - user_agent_regex: \+https\://kagi\.com/bot - action: ALLOW - # https://kagi.com/bot - remote_addresses: [ - "216.18.205.234/32", - "35.212.27.76/32", - "104.254.65.50/32", - "209.151.156.194/32" - ] -- name: marginalia - user_agent_regex: search\.marginalia\.nu - action: ALLOW - # Received directly over email - remote_addresses: [ - "193.183.0.162/31", - "193.183.0.164/30", - "193.183.0.168/30", - "193.183.0.172/31", - "193.183.0.174/32" - ] -- name: mojeekbot - user_agent_regex: http\://www\.mojeek\.com/bot\.html - action: ALLOW - # https://www.mojeek.com/bot.html - remote_addresses: [ "5.102.173.71/32" ] +# Search engines to allow +- import: (data)/crawlers/googlebot.yaml +- import: (data)/crawlers/bingbot.yaml +- import: (data)/crawlers/duckduckbot.yaml +- import: (data)/crawlers/qwantbot.yaml +- import: (data)/crawlers/internet-archive.yaml +- import: (data)/crawlers/kagibot.yaml +- import: (data)/crawlers/marginalia.yaml +- import: (data)/crawlers/mojeekbot.yaml -# Common "keeping the internet working" routes -- name: well-known - path_regex: ^/.well-known/.*$ - action: ALLOW -- name: favicon - path_regex: ^/favicon.ico$ - action: ALLOW -- name: robots-txt - path_regex: ^/robots.txt$ - action: ALLOW +# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) +- import: (data)/common/keep-internet-working.yaml # # Punish any bot with "bot" in the user-agent string +# # This is known to have a high false-positive rate, use at your own risk # - name: generic-bot-catchall # user_agent_regex: (?i:bot|crawler) # action: CHALLENGE @@ -654,6 +41,7 @@ bots: # report_as: 4 # lie to the operator # algorithm: slow # intentionally waste CPU cycles and time +# Generic catchall rule - name: generic-browser user_agent_regex: > Mozilla|Opera diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml new 
file mode 100644 index 0000000..19cbe93 --- /dev/null +++ b/data/bots/ai-robots-txt.yaml @@ -0,0 +1,4 @@ +- name: "ai-robots-txt" + user_agent_regex: > + AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot + action: DENY \ No newline at end of file diff --git a/data/bots/cloudflare-workers.yaml b/data/bots/cloudflare-workers.yaml new file mode 100644 index 0000000..3fe051b --- /dev/null +++ b/data/bots/cloudflare-workers.yaml @@ -0,0 +1,4 @@ +- name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY \ No newline at end of file diff --git a/data/bots/headless-browsers.yaml b/data/bots/headless-browsers.yaml new file mode 100644 index 0000000..9805290 --- /dev/null +++ b/data/bots/headless-browsers.yaml @@ -0,0 +1,9 @@ +- name: lightpanda + user_agent_regex: ^LightPanda/.*$ + action: DENY +- name: headless-chrome + user_agent_regex: HeadlessChrome + action: DENY +- name: headless-chromium + user_agent_regex: HeadlessChromium + action: DENY \ No newline at end of file diff --git a/data/bots/us-ai-scraper.yaml b/data/bots/us-ai-scraper.yaml new file mode 100644 index 0000000..b68920f --- /dev/null +++ b/data/bots/us-ai-scraper.yaml @@ -0,0 +1,3 @@ +- name: us-artificial-intelligence-scraper + user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper + action: DENY \ No newline at end of file diff --git a/data/common/allow-private-addresses.yaml 
b/data/common/allow-private-addresses.yaml new file mode 100644 index 0000000..3a3c0dc --- /dev/null +++ b/data/common/allow-private-addresses.yaml @@ -0,0 +1,15 @@ +- name: ipv4-rfc-1918 + action: ALLOW + remote_addresses: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + - 100.64.0.0/10 +- name: ipv6-ula + action: ALLOW + remote_addresses: + - fc00::/7 +- name: ipv6-link-local + action: ALLOW + remote_addresses: + - fe80::/10 \ No newline at end of file diff --git a/data/common/keep-internet-working.yaml b/data/common/keep-internet-working.yaml new file mode 100644 index 0000000..8270ef4 --- /dev/null +++ b/data/common/keep-internet-working.yaml @@ -0,0 +1,10 @@ +# Common "keeping the internet working" routes +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW +- name: favicon + path_regex: ^/favicon.ico$ + action: ALLOW +- name: robots-txt + path_regex: ^/robots.txt$ + action: ALLOW \ No newline at end of file diff --git a/data/crawlers/bingbot.yaml b/data/crawlers/bingbot.yaml new file mode 100644 index 0000000..2f7885d --- /dev/null +++ b/data/crawlers/bingbot.yaml @@ -0,0 +1,34 @@ +- name: bingbot + user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm + action: ALLOW + # https://www.bing.com/toolbox/bingbot.json + remote_addresses: [ + "157.55.39.0/24", + "207.46.13.0/24", + "40.77.167.0/24", + "13.66.139.0/24", + "13.66.144.0/24", + "52.167.144.0/24", + "13.67.10.16/28", + "13.69.66.240/28", + "13.71.172.224/28", + "139.217.52.0/28", + "191.233.204.224/28", + "20.36.108.32/28", + "20.43.120.16/28", + "40.79.131.208/28", + "40.79.186.176/28", + "52.231.148.0/28", + "20.79.107.240/28", + "51.105.67.0/28", + "20.125.163.80/28", + "40.77.188.0/22", + "65.55.210.0/24", + "199.30.24.0/23", + "40.77.202.0/24", + "40.77.139.0/25", + "20.74.197.0/28", + "20.15.133.160/27", + "40.77.177.0/24", + "40.77.178.0/23" + ] diff --git a/data/crawlers/duckduckbot.yaml b/data/crawlers/duckduckbot.yaml new file mode 100644 index 0000000..302a1e3 --- 
/dev/null +++ b/data/crawlers/duckduckbot.yaml @@ -0,0 +1,275 @@ +- name: duckduckbot + user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) + action: ALLOW + # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot + remote_addresses: [ + "57.152.72.128/32", + "51.8.253.152/32", + "40.80.242.63/32", + "20.12.141.99/32", + "20.49.136.28/32", + "51.116.131.221/32", + "51.107.40.209/32", + "20.40.133.240/32", + "20.50.168.91/32", + "51.120.48.122/32", + "20.193.45.113/32", + "40.76.173.151/32", + "40.76.163.7/32", + "20.185.79.47/32", + "52.142.26.175/32", + "20.185.79.15/32", + "52.142.24.149/32", + "40.76.162.208/32", + "40.76.163.23/32", + "40.76.162.191/32", + "40.76.162.247/32", + "40.88.21.235/32", + "20.191.45.212/32", + "52.146.59.12/32", + "52.146.59.156/32", + "52.146.59.154/32", + "52.146.58.236/32", + "20.62.224.44/32", + "51.104.180.53/32", + "51.104.180.47/32", + "51.104.180.26/32", + "51.104.146.225/32", + "51.104.146.235/32", + "20.73.202.147/32", + "20.73.132.240/32", + "20.71.12.143/32", + "20.56.197.58/32", + "20.56.197.63/32", + "20.43.150.93/32", + "20.43.150.85/32", + "20.44.222.1/32", + "40.89.243.175/32", + "13.89.106.77/32", + "52.143.242.6/32", + "52.143.241.111/32", + "52.154.60.82/32", + "20.197.209.11/32", + "20.197.209.27/32", + "20.226.133.105/32", + "191.234.216.4/32", + "191.234.216.178/32", + "20.53.92.211/32", + "20.53.91.2/32", + "20.207.99.197/32", + "20.207.97.190/32", + "40.81.250.205/32", + "40.64.106.11/32", + "40.64.105.247/32", + "20.72.242.93/32", + "20.99.255.235/32", + "20.113.3.121/32", + "52.224.16.221/32", + "52.224.21.53/32", + "52.224.20.204/32", + "52.224.21.19/32", + "52.224.20.249/32", + "52.224.20.203/32", + "52.224.20.190/32", + "52.224.16.229/32", + "52.224.21.20/32", + "52.146.63.80/32", + "52.224.20.227/32", + "52.224.20.193/32", + "52.190.37.160/32", + "52.224.21.23/32", + "52.224.20.223/32", + "52.224.20.181/32", + "52.224.21.49/32", + "52.224.21.55/32", + 
"52.224.21.61/32", + "52.224.19.152/32", + "52.224.20.186/32", + "52.224.21.27/32", + "52.224.21.51/32", + "52.224.20.174/32", + "52.224.21.4/32", + "51.104.164.109/32", + "51.104.167.71/32", + "51.104.160.177/32", + "51.104.162.149/32", + "51.104.167.95/32", + "51.104.167.54/32", + "51.104.166.111/32", + "51.104.167.88/32", + "51.104.161.32/32", + "51.104.163.250/32", + "51.104.164.189/32", + "51.104.167.19/32", + "51.104.160.167/32", + "51.104.167.110/32", + "20.191.44.119/32", + "51.104.167.104/32", + "20.191.44.234/32", + "51.104.164.215/32", + "51.104.167.52/32", + "20.191.44.22/32", + "51.104.167.87/32", + "51.104.167.96/32", + "20.191.44.16/32", + "51.104.167.61/32", + "51.104.164.147/32", + "20.50.48.159/32", + "40.114.182.172/32", + "20.50.50.130/32", + "20.50.50.163/32", + "20.50.50.46/32", + "40.114.182.153/32", + "20.50.50.118/32", + "20.50.49.55/32", + "20.50.49.25/32", + "40.114.183.251/32", + "20.50.50.123/32", + "20.50.49.237/32", + "20.50.48.192/32", + "20.50.50.134/32", + "51.138.90.233/32", + "40.114.183.196/32", + "20.50.50.146/32", + "40.114.183.88/32", + "20.50.50.145/32", + "20.50.50.121/32", + "20.50.49.40/32", + "51.138.90.206/32", + "40.114.182.45/32", + "51.138.90.161/32", + "20.50.49.0/32", + "40.119.232.215/32", + "104.43.55.167/32", + "40.119.232.251/32", + "40.119.232.50/32", + "40.119.232.146/32", + "40.119.232.218/32", + "104.43.54.127/32", + "104.43.55.117/32", + "104.43.55.116/32", + "104.43.55.166/32", + "52.154.169.50/32", + "52.154.171.70/32", + "52.154.170.229/32", + "52.154.170.113/32", + "52.154.171.44/32", + "52.154.172.2/32", + "52.143.244.81/32", + "52.154.171.87/32", + "52.154.171.250/32", + "52.154.170.28/32", + "52.154.170.122/32", + "52.143.243.117/32", + "52.143.247.235/32", + "52.154.171.235/32", + "52.154.171.196/32", + "52.154.171.0/32", + "52.154.170.243/32", + "52.154.170.26/32", + "52.154.169.200/32", + "52.154.170.96/32", + "52.154.170.88/32", + "52.154.171.150/32", + "52.154.171.205/32", + 
"52.154.170.117/32", + "52.154.170.209/32", + "191.235.202.48/32", + "191.233.3.202/32", + "191.235.201.214/32", + "191.233.3.197/32", + "191.235.202.38/32", + "20.53.78.144/32", + "20.193.24.10/32", + "20.53.78.236/32", + "20.53.78.138/32", + "20.53.78.123/32", + "20.53.78.106/32", + "20.193.27.215/32", + "20.193.25.197/32", + "20.193.12.126/32", + "20.193.24.251/32", + "20.204.242.101/32", + "20.207.72.113/32", + "20.204.242.19/32", + "20.219.45.67/32", + "20.207.72.11/32", + "20.219.45.190/32", + "20.204.243.55/32", + "20.204.241.148/32", + "20.207.72.110/32", + "20.204.240.172/32", + "20.207.72.21/32", + "20.204.246.81/32", + "20.207.107.181/32", + "20.204.246.254/32", + "20.219.43.246/32", + "52.149.25.43/32", + "52.149.61.51/32", + "52.149.58.139/32", + "52.149.60.38/32", + "52.148.165.38/32", + "52.143.95.162/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + 
"4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32" + ] diff --git a/data/crawlers/googlebot.yaml b/data/crawlers/googlebot.yaml new file mode 100644 index 0000000..f173512 --- /dev/null +++ b/data/crawlers/googlebot.yaml @@ -0,0 +1,263 @@ +- name: googlebot + user_agent_regex: \+http\://www\.google\.com/bot\.html + action: ALLOW + # https://developers.google.com/static/search/apis/ipranges/googlebot.json + remote_addresses: [ + "2001:4860:4801:10::/64", + "2001:4860:4801:11::/64", + "2001:4860:4801:12::/64", + "2001:4860:4801:13::/64", + "2001:4860:4801:14::/64", + "2001:4860:4801:15::/64", + "2001:4860:4801:16::/64", + "2001:4860:4801:17::/64", + "2001:4860:4801:18::/64", + "2001:4860:4801:19::/64", + "2001:4860:4801:1a::/64", + "2001:4860:4801:1b::/64", + "2001:4860:4801:1c::/64", + "2001:4860:4801:1d::/64", + "2001:4860:4801:1e::/64", + "2001:4860:4801:1f::/64", + "2001:4860:4801:20::/64", + "2001:4860:4801:21::/64", + "2001:4860:4801:22::/64", + "2001:4860:4801:23::/64", + "2001:4860:4801:24::/64", + "2001:4860:4801:25::/64", + "2001:4860:4801:26::/64", + "2001:4860:4801:27::/64", + "2001:4860:4801:28::/64", + "2001:4860:4801:29::/64", + "2001:4860:4801:2::/64", + "2001:4860:4801:2a::/64", + "2001:4860:4801:2b::/64", + "2001:4860:4801:2c::/64", + "2001:4860:4801:2d::/64", + "2001:4860:4801:2e::/64", + "2001:4860:4801:2f::/64", + "2001:4860:4801:31::/64", + "2001:4860:4801:32::/64", + "2001:4860:4801:33::/64", + "2001:4860:4801:34::/64", + "2001:4860:4801:35::/64", + "2001:4860:4801:36::/64", + "2001:4860:4801:37::/64", + "2001:4860:4801:38::/64", + "2001:4860:4801:39::/64", + "2001:4860:4801:3a::/64", + "2001:4860:4801:3b::/64", + "2001:4860:4801:3c::/64", + "2001:4860:4801:3d::/64", + "2001:4860:4801:3e::/64", + "2001:4860:4801:40::/64", + "2001:4860:4801:41::/64", + "2001:4860:4801:42::/64", + "2001:4860:4801:43::/64", + "2001:4860:4801:44::/64", 
+ "2001:4860:4801:45::/64", + "2001:4860:4801:46::/64", + "2001:4860:4801:47::/64", + "2001:4860:4801:48::/64", + "2001:4860:4801:49::/64", + "2001:4860:4801:4a::/64", + "2001:4860:4801:4b::/64", + "2001:4860:4801:4c::/64", + "2001:4860:4801:50::/64", + "2001:4860:4801:51::/64", + "2001:4860:4801:52::/64", + "2001:4860:4801:53::/64", + "2001:4860:4801:54::/64", + "2001:4860:4801:55::/64", + "2001:4860:4801:56::/64", + "2001:4860:4801:60::/64", + "2001:4860:4801:61::/64", + "2001:4860:4801:62::/64", + "2001:4860:4801:63::/64", + "2001:4860:4801:64::/64", + "2001:4860:4801:65::/64", + "2001:4860:4801:66::/64", + "2001:4860:4801:67::/64", + "2001:4860:4801:68::/64", + "2001:4860:4801:69::/64", + "2001:4860:4801:6a::/64", + "2001:4860:4801:6b::/64", + "2001:4860:4801:6c::/64", + "2001:4860:4801:6d::/64", + "2001:4860:4801:6e::/64", + "2001:4860:4801:6f::/64", + "2001:4860:4801:70::/64", + "2001:4860:4801:71::/64", + "2001:4860:4801:72::/64", + "2001:4860:4801:73::/64", + "2001:4860:4801:74::/64", + "2001:4860:4801:75::/64", + "2001:4860:4801:76::/64", + "2001:4860:4801:77::/64", + "2001:4860:4801:78::/64", + "2001:4860:4801:79::/64", + "2001:4860:4801:80::/64", + "2001:4860:4801:81::/64", + "2001:4860:4801:82::/64", + "2001:4860:4801:83::/64", + "2001:4860:4801:84::/64", + "2001:4860:4801:85::/64", + "2001:4860:4801:86::/64", + "2001:4860:4801:87::/64", + "2001:4860:4801:88::/64", + "2001:4860:4801:90::/64", + "2001:4860:4801:91::/64", + "2001:4860:4801:92::/64", + "2001:4860:4801:93::/64", + "2001:4860:4801:94::/64", + "2001:4860:4801:95::/64", + "2001:4860:4801:96::/64", + "2001:4860:4801:a0::/64", + "2001:4860:4801:a1::/64", + "2001:4860:4801:a2::/64", + "2001:4860:4801:a3::/64", + "2001:4860:4801:a4::/64", + "2001:4860:4801:a5::/64", + "2001:4860:4801:c::/64", + "2001:4860:4801:f::/64", + "192.178.5.0/27", + "192.178.6.0/27", + "192.178.6.128/27", + "192.178.6.160/27", + "192.178.6.192/27", + "192.178.6.32/27", + "192.178.6.64/27", + "192.178.6.96/27", + 
"34.100.182.96/28", + "34.101.50.144/28", + "34.118.254.0/28", + "34.118.66.0/28", + "34.126.178.96/28", + "34.146.150.144/28", + "34.147.110.144/28", + "34.151.74.144/28", + "34.152.50.64/28", + "34.154.114.144/28", + "34.155.98.32/28", + "34.165.18.176/28", + "34.175.160.64/28", + "34.176.130.16/28", + "34.22.85.0/27", + "34.64.82.64/28", + "34.65.242.112/28", + "34.80.50.80/28", + "34.88.194.0/28", + "34.89.10.80/28", + "34.89.198.80/28", + "34.96.162.48/28", + "35.247.243.240/28", + "66.249.64.0/27", + "66.249.64.128/27", + "66.249.64.160/27", + "66.249.64.224/27", + "66.249.64.32/27", + "66.249.64.64/27", + "66.249.64.96/27", + "66.249.65.0/27", + "66.249.65.128/27", + "66.249.65.160/27", + "66.249.65.192/27", + "66.249.65.224/27", + "66.249.65.32/27", + "66.249.65.64/27", + "66.249.65.96/27", + "66.249.66.0/27", + "66.249.66.128/27", + "66.249.66.160/27", + "66.249.66.192/27", + "66.249.66.224/27", + "66.249.66.32/27", + "66.249.66.64/27", + "66.249.66.96/27", + "66.249.68.0/27", + "66.249.68.128/27", + "66.249.68.32/27", + "66.249.68.64/27", + "66.249.68.96/27", + "66.249.69.0/27", + "66.249.69.128/27", + "66.249.69.160/27", + "66.249.69.192/27", + "66.249.69.224/27", + "66.249.69.32/27", + "66.249.69.64/27", + "66.249.69.96/27", + "66.249.70.0/27", + "66.249.70.128/27", + "66.249.70.160/27", + "66.249.70.192/27", + "66.249.70.224/27", + "66.249.70.32/27", + "66.249.70.64/27", + "66.249.70.96/27", + "66.249.71.0/27", + "66.249.71.128/27", + "66.249.71.160/27", + "66.249.71.192/27", + "66.249.71.224/27", + "66.249.71.32/27", + "66.249.71.64/27", + "66.249.71.96/27", + "66.249.72.0/27", + "66.249.72.128/27", + "66.249.72.160/27", + "66.249.72.192/27", + "66.249.72.224/27", + "66.249.72.32/27", + "66.249.72.64/27", + "66.249.72.96/27", + "66.249.73.0/27", + "66.249.73.128/27", + "66.249.73.160/27", + "66.249.73.192/27", + "66.249.73.224/27", + "66.249.73.32/27", + "66.249.73.64/27", + "66.249.73.96/27", + "66.249.74.0/27", + "66.249.74.128/27", + 
"66.249.74.160/27", + "66.249.74.192/27", + "66.249.74.32/27", + "66.249.74.64/27", + "66.249.74.96/27", + "66.249.75.0/27", + "66.249.75.128/27", + "66.249.75.160/27", + "66.249.75.192/27", + "66.249.75.224/27", + "66.249.75.32/27", + "66.249.75.64/27", + "66.249.75.96/27", + "66.249.76.0/27", + "66.249.76.128/27", + "66.249.76.160/27", + "66.249.76.192/27", + "66.249.76.224/27", + "66.249.76.32/27", + "66.249.76.64/27", + "66.249.76.96/27", + "66.249.77.0/27", + "66.249.77.128/27", + "66.249.77.160/27", + "66.249.77.192/27", + "66.249.77.224/27", + "66.249.77.32/27", + "66.249.77.64/27", + "66.249.77.96/27", + "66.249.78.0/27", + "66.249.78.32/27", + "66.249.79.0/27", + "66.249.79.128/27", + "66.249.79.160/27", + "66.249.79.192/27", + "66.249.79.224/27", + "66.249.79.32/27", + "66.249.79.64/27", + "66.249.79.96/27" + ] diff --git a/data/crawlers/internet-archive.yaml b/data/crawlers/internet-archive.yaml new file mode 100644 index 0000000..5e209e8 --- /dev/null +++ b/data/crawlers/internet-archive.yaml @@ -0,0 +1,8 @@ +- name: internet-archive + action: ALLOW + # https://ipinfo.io/AS7941 + remote_addresses: [ + "207.241.224.0/20", + "208.70.24.0/21", + "2620:0:9c0::/48" + ] \ No newline at end of file diff --git a/data/crawlers/kagibot.yaml b/data/crawlers/kagibot.yaml new file mode 100644 index 0000000..db62b57 --- /dev/null +++ b/data/crawlers/kagibot.yaml @@ -0,0 +1,10 @@ +- name: kagibot + user_agent_regex: \+https\://kagi\.com/bot + action: ALLOW + # https://kagi.com/bot + remote_addresses: [ + "216.18.205.234/32", + "35.212.27.76/32", + "104.254.65.50/32", + "209.151.156.194/32" + ] diff --git a/data/crawlers/marginalia.yaml b/data/crawlers/marginalia.yaml new file mode 100644 index 0000000..e12ebc4 --- /dev/null +++ b/data/crawlers/marginalia.yaml @@ -0,0 +1,11 @@ +- name: marginalia + user_agent_regex: search\.marginalia\.nu + action: ALLOW + # Received directly over email + remote_addresses: [ + "193.183.0.162/31", + "193.183.0.164/30", + 
"193.183.0.168/30", + "193.183.0.172/31", + "193.183.0.174/32" + ] \ No newline at end of file diff --git a/data/crawlers/mojeekbot.yaml b/data/crawlers/mojeekbot.yaml new file mode 100644 index 0000000..fcd20f5 --- /dev/null +++ b/data/crawlers/mojeekbot.yaml @@ -0,0 +1,5 @@ +- name: mojeekbot + user_agent_regex: http\://www\.mojeek\.com/bot\.html + action: ALLOW + # https://www.mojeek.com/bot.html + remote_addresses: [ "5.102.173.71/32" ] \ No newline at end of file diff --git a/data/crawlers/qwantbot.yaml b/data/crawlers/qwantbot.yaml new file mode 100644 index 0000000..a402154 --- /dev/null +++ b/data/crawlers/qwantbot.yaml @@ -0,0 +1,5 @@ +- name: qwantbot + user_agent_regex: \+https\://help\.qwant\.com/bot/ + action: ALLOW + # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + remote_addresses: [ "91.242.162.0/24" ] diff --git a/data/embed.go b/data/embed.go index c1fbe68..ebb2152 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package data import "embed" var ( - //go:embed botPolicies.yaml botPolicies.json + //go:embed botPolicies.yaml botPolicies.json apps bots common crawlers BotPolicies embed.FS ) diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 71cc42a..045c34d 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Enable [importing configuration snippets](./admin/configuration/import.mdx) ([#321](https://github.com/TecharoHQ/anubis/pull/321)) - Refactor check logic to be more generic and work on a Checker type - Add more AI user agents based on the [ai.robots.txt](https://github.com/ai-robots-txt/ai.robots.txt) project - Embedded challenge data in initial HTML response to improve performance diff --git a/docs/docs/admin/configuration/import.mdx b/docs/docs/admin/configuration/import.mdx new file mode 100644 index 0000000..9934ce7 --- /dev/null +++ 
b/docs/docs/admin/configuration/import.mdx @@ -0,0 +1,147 @@ +# Importing configuration rules + +import Tabs from "@theme/Tabs"; +import TabItem from "@theme/TabItem"; + +Anubis has the ability to let you import snippets of configuration into the main configuration file. This allows you to break up your config into smaller parts that get logically assembled into one big file. + +For example: + +<Tabs> +<TabItem value="json" label="JSON"> + +```json +{ + "bots": [ + { + "import": "(data)/bots/ai-robots-txt.yaml" + }, + { + "import": "(data)/bots/cloudflare-workers.yaml" + } + ] +} +``` + +</TabItem> +<TabItem value="yaml" label="YAML"> + +```yaml +bots: + # Pathological bots to deny + - # This correlates to data/bots/ai-robots-txt.yaml in the source tree + import: (data)/bots/ai-robots-txt.yaml + - import: (data)/bots/cloudflare-workers.yaml +``` + +</TabItem> +</Tabs> + +Of note, a bot rule can either have inline bot configuration or import a bot config snippet. You cannot do both in a single bot rule. + +<Tabs> +<TabItem value="json" label="JSON"> + +```json +{ + "bots": [ + { + "import": "(data)/bots/ai-robots-txt.yaml", + "name": "generic-browser", + "user_agent_regex": "Mozilla|Opera\n", + "action": "CHALLENGE" + } + ] +} +``` + +</TabItem> +<TabItem value="yaml" label="YAML"> + +```yaml +bots: + - import: (data)/bots/ai-robots-txt.yaml + name: generic-browser + user_agent_regex: > + Mozilla|Opera + action: CHALLENGE +``` + +</TabItem> +</Tabs> + +This will return an error like this: + +```text +config is not valid: +config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both +``` + +Paths can either be prefixed with `(data)` to import from [the data folder in the Anubis source tree](https://github.com/TecharoHQ/anubis/tree/main/data) or point to a file anywhere on the filesystem. If you don't have access to the Anubis source tree, check `/usr/share/docs/anubis/data` or the tarball you extracted Anubis from. + +## Writing snippets + +Snippets can be written in either JSON or YAML, with a preference for YAML. When writing a snippet, write the bot rules you want directly at the top level of the file in a list.
+ +Here is an example snippet that allows [IPv6 Unique Local Addresses](https://en.wikipedia.org/wiki/Unique_local_address) through Anubis: + +<Tabs> +<TabItem value="json" label="JSON"> + +```json +[ + { + "name": "ipv6-ula", + "action": "ALLOW", + "remote_addresses": ["fc00::/7"] + } +] +``` + +</TabItem> +<TabItem value="yaml" label="YAML"> + +```yaml +- name: ipv6-ula + action: ALLOW + remote_addresses: + - fc00::/7 +``` + +</TabItem> +</Tabs> + +## Extracting Anubis' embedded filesystem + +You can always extract the list of rules embedded into the Anubis binary with this command: + +```text +anubis --extract-resources=static +``` + +This will dump the contents of Anubis' embedded data to a new folder named `static`: + +```text +static +├── apps +│ └── gitea-rss-feeds.yaml +├── botPolicies.json +├── botPolicies.yaml +├── bots +│ ├── ai-robots-txt.yaml +│ ├── cloudflare-workers.yaml +│ ├── headless-browsers.yaml +│ └── us-ai-scraper.yaml +├── common +│ ├── allow-private-addresses.yaml +│ └── keep-internet-working.yaml +└── crawlers + ├── bingbot.yaml + ├── duckduckbot.yaml + ├── googlebot.yaml + ├── internet-archive.yaml + ├── kagibot.yaml + ├── marginalia.yaml + ├── mojeekbot.yaml + └── qwantbot.yaml +``` diff --git a/docs/docs/admin/policies.mdx b/docs/docs/admin/policies.mdx index 11af725..b23a62f 100644 --- a/docs/docs/admin/policies.mdx +++ b/docs/docs/admin/policies.mdx @@ -12,6 +12,7 @@ Bot policies let you customize the rules that Anubis uses to allow, deny, or cha - Request path - User agent string - HTTP request header values +- [Importing other configuration snippets](./configuration/import.mdx) As of version v1.17.0 or later, configuration can be written in either JSON or YAML.
diff --git a/lib/policy/config/config.go b/lib/policy/config/config.go index b3d5cac..627e9cf 100644 --- a/lib/policy/config/config.go +++ b/lib/policy/config/config.go @@ -3,8 +3,15 @@ package config import ( "errors" "fmt" + "io" + "io/fs" "net" + "os" "regexp" + "strings" + + "github.com/TecharoHQ/anubis/data" + "k8s.io/apimachinery/pkg/util/yaml" ) var ( @@ -17,6 +24,9 @@ var ( ErrInvalidPathRegex = errors.New("config.Bot: invalid path regex") ErrInvalidHeadersRegex = errors.New("config.Bot: invalid headers regex") ErrInvalidCIDR = errors.New("config.Bot: invalid CIDR") + ErrInvalidImportStatement = errors.New("config.ImportStatement: invalid source file") + ErrCantSetBotAndImportValuesAtOnce = errors.New("config.BotOrImport: can't set bot rules and import values at the same time") + ErrMustSetBotOrImportRules = errors.New("config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both") ) type Rule string @@ -47,6 +57,24 @@ type BotConfig struct { Challenge *ChallengeRules `json:"challenge,omitempty"` } +func (b BotConfig) Zero() bool { + for _, cond := range []bool{ + b.Name != "", + b.UserAgentRegex != nil, + b.PathRegex != nil, + len(b.HeadersRegex) != 0, + b.Action != "", + len(b.RemoteAddr) != 0, + b.Challenge != nil, + } { + if cond { + return false + } + } + + return true +} + func (b BotConfig) Valid() error { var errs []error @@ -151,9 +179,147 @@ func (cr ChallengeRules) Valid() error { return nil } +type ImportStatement struct { + Import string `json:"import"` + Bots []BotConfig +} + +func (is *ImportStatement) open() (fs.File, error) { + if strings.HasPrefix(is.Import, "(data)/") { + fname := strings.TrimPrefix(is.Import, "(data)/") + fin, err := data.BotPolicies.Open(fname) + return fin, err + } + + return os.Open(is.Import) +} + +func (is *ImportStatement) load() error { + fin, err := is.open() + if err != nil { + return fmt.Errorf("can't open %s: %w", is.Import, err) + } + defer fin.Close() + + var 
result []BotConfig + + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&result); err != nil { + return fmt.Errorf("can't parse %s: %w", is.Import, err) + } + + var errs []error + + for _, b := range result { + if err := b.Valid(); err != nil { + errs = append(errs, err) + } + } + + if len(errs) != 0 { + return fmt.Errorf("config %s is not valid:\n%w", is.Import, errors.Join(errs...)) + } + + is.Bots = result + + return nil +} + +func (is *ImportStatement) Valid() error { + return is.load() +} + +type BotOrImport struct { + *BotConfig `json:",inline"` + *ImportStatement `json:",inline"` +} + +func (boi *BotOrImport) Valid() error { + if boi.BotConfig != nil && boi.ImportStatement != nil { + return ErrCantSetBotAndImportValuesAtOnce + } + + if boi.BotConfig != nil { + return boi.BotConfig.Valid() + } + + if boi.ImportStatement != nil { + return boi.ImportStatement.Valid() + } + + return ErrMustSetBotOrImportRules +} + +type fileConfig struct { + Bots []BotOrImport `json:"bots"` + DNSBL bool `json:"dnsbl"` +} + +func (c fileConfig) Valid() error { + var errs []error + + if len(c.Bots) == 0 { + errs = append(errs, ErrNoBotRulesDefined) + } + + for _, b := range c.Bots { + if err := b.Valid(); err != nil { + errs = append(errs, err) + } + } + + if len(errs) != 0 { + return fmt.Errorf("config is not valid:\n%w", errors.Join(errs...)) + } + + return nil +} + +func Load(fin io.Reader, fname string) (*Config, error) { + var c fileConfig + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { + return nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err) + } + + if err := c.Valid(); err != nil { + return nil, err + } + + result := &Config{ + DNSBL: c.DNSBL, + } + + var validationErrs []error + + for _, boi := range c.Bots { + if boi.ImportStatement != nil { + if err := boi.load(); err != nil { + validationErrs = append(validationErrs, err) + continue + } + + result.Bots = append(result.Bots, boi.ImportStatement.Bots...) 
+ } + + if boi.BotConfig != nil { + if err := boi.BotConfig.Valid(); err != nil { + validationErrs = append(validationErrs, err) + continue + } + + result.Bots = append(result.Bots, *boi.BotConfig) + } + } + + if len(validationErrs) > 0 { + return nil, fmt.Errorf("errors validating policy config %s: %w", fname, errors.Join(validationErrs...)) + } + + return result, nil +} + type Config struct { - Bots []BotConfig `json:"bots"` - DNSBL bool `json:"dnsbl"` + Bots []BotConfig + DNSBL bool } func (c Config) Valid() error { diff --git a/lib/policy/config/config_test.go b/lib/policy/config/config_test.go index 4176126..86c490e 100644 --- a/lib/policy/config/config_test.go +++ b/lib/policy/config/config_test.go @@ -2,10 +2,12 @@ package config import ( "errors" + "io/fs" "os" "path/filepath" "testing" + "github.com/TecharoHQ/anubis/data" "k8s.io/apimachinery/pkg/util/yaml" ) @@ -219,13 +221,69 @@ func TestConfigValidKnownGood(t *testing.T) { } defer fin.Close() - var c Config - if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { - t.Fatalf("can't decode file: %v", err) + c, err := Load(fin, st.Name()) + if err != nil { + t.Fatal(err) } if err := c.Valid(); err != nil { - t.Fatal(err) + t.Error(err) + } + + if len(c.Bots) == 0 { + t.Error("wanted more than 0 bots, got zero") + } + }) + } +} + +func TestImportStatement(t *testing.T) { + type testCase struct { + name string + importPath string + err error + } + + var tests []testCase + + for _, folderName := range []string{ + "apps", + "bots", + "common", + "crawlers", + } { + if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + + tests = append(tests, testCase{ + name: "(data)/" + path, + importPath: "(data)/" + path, + err: nil, + }) + + return nil + }); err != nil { + t.Fatal(err) + } + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + is := &ImportStatement{ + 
Import: tt.importPath, + } + + if err := is.Valid(); err != nil { + t.Errorf("validation error: %v", err) + } + + if len(is.Bots) == 0 { + t.Error("wanted bot definitions, but got none") } }) } @@ -246,7 +304,7 @@ func TestConfigValidBad(t *testing.T) { } defer fin.Close() - var c Config + var c fileConfig if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { t.Fatalf("can't decode file: %v", err) } @@ -259,3 +317,49 @@ func TestConfigValidBad(t *testing.T) { }) } } + +func TestBotConfigZero(t *testing.T) { + var b BotConfig + if !b.Zero() { + t.Error("zero value BotConfig is not zero value") + } + + b.Name = "hi" + if b.Zero() { + t.Error("BotConfig with name is zero value") + } + + b.UserAgentRegex = p(".*") + if b.Zero() { + t.Error("BotConfig with user agent regex is zero value") + } + + b.PathRegex = p(".*") + if b.Zero() { + t.Error("BotConfig with path regex is zero value") + } + + b.HeadersRegex = map[string]string{"hi": "there"} + if b.Zero() { + t.Error("BotConfig with headers regex is zero value") + } + + b.Action = RuleAllow + if b.Zero() { + t.Error("BotConfig with action is zero value") + } + + b.RemoteAddr = []string{"::/0"} + if b.Zero() { + t.Error("BotConfig with remote addresses is zero value") + } + + b.Challenge = &ChallengeRules{ + Difficulty: 4, + ReportAs: 4, + Algorithm: AlgorithmFast, + } + if b.Zero() { + t.Error("BotConfig with challenge rules is zero value") + } +} diff --git a/lib/policy/config/testdata/bad/import_and_bot.json b/lib/policy/config/testdata/bad/import_and_bot.json new file mode 100644 index 0000000..7fa4255 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_and_bot.json @@ -0,0 +1,10 @@ +{ + "bots": [ + { + "import": "(data)/bots/ai-robots-txt.yaml", + "name": "generic-browser", + "user_agent_regex": "Mozilla|Opera\n", + "action": "CHALLENGE" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/import_and_bot.yaml b/lib/policy/config/testdata/bad/import_and_bot.yaml new 
file mode 100644 index 0000000..0080b10 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_and_bot.yaml @@ -0,0 +1,6 @@ +bots: +- import: (data)/bots/ai-robots-txt.yaml + name: generic-browser + user_agent_regex: > + Mozilla|Opera + action: CHALLENGE \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/import_invalid_file.json b/lib/policy/config/testdata/bad/import_invalid_file.json new file mode 100644 index 0000000..c7546c0 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_invalid_file.json @@ -0,0 +1,7 @@ +{ + "bots": [ + { + "import": "(data)/does-not-exist-fake-file.yaml" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/bad/import_invalid_file.yaml b/lib/policy/config/testdata/bad/import_invalid_file.yaml new file mode 100644 index 0000000..df78c06 --- /dev/null +++ b/lib/policy/config/testdata/bad/import_invalid_file.yaml @@ -0,0 +1,2 @@ +bots: +- import: (data)/does-not-exist-fake-file.yaml \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_filesystem.json b/lib/policy/config/testdata/good/import_filesystem.json new file mode 100644 index 0000000..23480c9 --- /dev/null +++ b/lib/policy/config/testdata/good/import_filesystem.json @@ -0,0 +1,7 @@ +{ + "bots": [ + { + "import": "./testdata/hack-test.json" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_filesystem.yaml b/lib/policy/config/testdata/good/import_filesystem.yaml new file mode 100644 index 0000000..422ccc4 --- /dev/null +++ b/lib/policy/config/testdata/good/import_filesystem.yaml @@ -0,0 +1,2 @@ +bots: +- import: ./testdata/hack-test.yaml \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_keep_internet_working.json b/lib/policy/config/testdata/good/import_keep_internet_working.json new file mode 100644 index 0000000..68ff2db --- /dev/null +++ b/lib/policy/config/testdata/good/import_keep_internet_working.json @@ -0,0 +1,7 @@ +{ + "bots": 
[ + { + "import": "(data)/common/keep-internet-working.yaml" + } + ] +} \ No newline at end of file diff --git a/lib/policy/config/testdata/good/import_keep_internet_working.yaml b/lib/policy/config/testdata/good/import_keep_internet_working.yaml new file mode 100644 index 0000000..923ffe3 --- /dev/null +++ b/lib/policy/config/testdata/good/import_keep_internet_working.yaml @@ -0,0 +1,2 @@ +bots: +- import: (data)/common/keep-internet-working.yaml \ No newline at end of file diff --git a/lib/policy/config/testdata/hack-test.json b/lib/policy/config/testdata/hack-test.json new file mode 100644 index 0000000..652dcd8 --- /dev/null +++ b/lib/policy/config/testdata/hack-test.json @@ -0,0 +1,9 @@ +[ + { + "name": "ipv6-ula", + "action": "ALLOW", + "remote_addresses": [ + "fc00::/7" + ] + } +] \ No newline at end of file diff --git a/lib/policy/config/testdata/hack-test.yaml b/lib/policy/config/testdata/hack-test.yaml new file mode 100644 index 0000000..cd4d7d0 --- /dev/null +++ b/lib/policy/config/testdata/hack-test.yaml @@ -0,0 +1,3 @@ +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW \ No newline at end of file diff --git a/lib/policy/policy.go b/lib/policy/policy.go index 368768b..7c45ff6 100644 --- a/lib/policy/policy.go +++ b/lib/policy/policy.go @@ -7,7 +7,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - "k8s.io/apimachinery/pkg/util/yaml" "github.com/TecharoHQ/anubis/lib/policy/config" ) @@ -20,26 +19,22 @@ var ( ) type ParsedConfig struct { - orig config.Config + orig *config.Config Bots []Bot DNSBL bool DefaultDifficulty int } -func NewParsedConfig(orig config.Config) *ParsedConfig { +func NewParsedConfig(orig *config.Config) *ParsedConfig { return &ParsedConfig{ orig: orig, } } func ParseConfig(fin io.Reader, fname string, defaultDifficulty int) (*ParsedConfig, error) { - var c config.Config - if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { - return 
nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err) - } - - if err := c.Valid(); err != nil { + c, err := config.Load(fin, fname) + if err != nil { return nil, err } diff --git a/lib/policy/testdata/hack-test.json b/lib/policy/testdata/hack-test.json new file mode 100644 index 0000000..652dcd8 --- /dev/null +++ b/lib/policy/testdata/hack-test.json @@ -0,0 +1,9 @@ +[ + { + "name": "ipv6-ula", + "action": "ALLOW", + "remote_addresses": [ + "fc00::/7" + ] + } +] \ No newline at end of file diff --git a/lib/policy/testdata/hack-test.yaml b/lib/policy/testdata/hack-test.yaml new file mode 100644 index 0000000..cd4d7d0 --- /dev/null +++ b/lib/policy/testdata/hack-test.yaml @@ -0,0 +1,3 @@ +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW \ No newline at end of file