diff --git a/data/botPolicies.json b/data/botPolicies.json index 7d6e4cb..dbc3d35 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -1,8 +1,8 @@ { "bots": [ { - "name": "amazonbot", - "user_agent_regex": "Amazonbot", + "name": "ai-robots-txt", + "user_agent_regex": "AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot", "action": "DENY" }, { diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index ebcd35c..94cf468 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Add more AI user agents based on the [ai.robots.txt](https://github.com/ai-robots-txt/ai.robots.txt) project - Embedded challenge data in initial HTML response to improve performance - Whitelisted [DuckDuckBot](https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/) in botPolicies - Improvements to build scripts to make them less dependent on the build host diff --git a/internal/test/playwright_test.go b/internal/test/playwright_test.go index 7859b71..69652ce 100644 --- a/internal/test/playwright_test.go +++ b/internal/test/playwright_test.go @@ -52,6 +52,24 @@ var ( realIP: placeholderIP, userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/120.0.6099.28 Safari/537.36", }, + { + name: "Amazonbot", + action: actionDeny, + realIP: 
placeholderIP, + userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)", + }, + { + name: "Amazonbot", + action: actionDeny, + realIP: placeholderIP, + userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)", + }, + { + name: "PerplexityAI", + action: actionDeny, + realIP: placeholderIP, + userAgent: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)", + }, { name: "kagiBadIP", action: actionChallenge,