diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml index 16df8cb..3920cf9 100644 --- a/data/bots/ai-robots-txt.yaml +++ b/data/bots/ai-robots-txt.yaml @@ -2,5 +2,5 @@ # Note: Blocks human-directed/non-training user agents - name: "ai-robots-txt" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot action: DENY diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index d714c94..e6fe5e8 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- Replace cidranger with bart for IP range checking, improving IP matching performance by 3-20x with zero heap +- Replace cidranger with bart for IP range checking, improving IP matching performance by 3-20x with zero heap allocations - Remove the unused `/test-error` endpoint and update the testing endpoint `/make-challenge` to only be enabled in development @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Refactor challenge presentation logic to use a challenge registry - Allow challenge implementations to register HTTP routes - Implement a no-JS challenge method: [`metarefresh`](./admin/configuration/challenges/metarefresh.mdx) ([#95](https://github.com/TecharoHQ/anubis/issues/95)) -- Bump AI-robots.txt to version 1.34 +- Bump AI-robots.txt to version 1.37 - Make progress bar styling more compatible (UXP, etc) - Optimized the OGTags subsystem with reduced allocations and runtime per request by up to 66% - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers diff --git a/web/static/robots.txt b/web/static/robots.txt index 1b0f841..f09f0f9 100644 --- a/web/static/robots.txt +++ b/web/static/robots.txt @@ -21,7 +21,9 @@ User-agent: Cotoyogi User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot +User-agent: EchoboxBot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler @@ -42,6 +44,7 @@ User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher User-agent: MistralAI-User/1.0 +User-agent: MyCentralAIScraperBot User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili @@ -54,12 +57,17 @@ User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot +User-agent: Poseidon Research Crawler User-agent: QualifiedBot User-agent: QuillBot User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy +User-agent: SemrushBot +User-agent: SemrushBot-BA +User-agent: SemrushBot-CT User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SI User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: TikTokSpider