From 76514f9f327f8639622a28b26e2cf5ffd806bfcc Mon Sep 17 00:00:00 2001 From: Dryusdan Date: Mon, 28 Apr 2025 02:52:08 +0200 Subject: [PATCH] Bump AI-robots.txt rules to version 1.29 (#383) --- data/bots/ai-robots-txt.yaml | 4 ++-- web/static/robots.txt | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml index ef2790c..5beb090 100644 --- a/data/bots/ai-robots-txt.yaml +++ b/data/bots/ai-robots-txt.yaml @@ -1,4 +1,4 @@ - name: "ai-robots-txt" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot - action: DENY \ No newline at end of file + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot + action: DENY diff --git a/web/static/robots.txt b/web/static/robots.txt index 4d0861c..e9dd073 100644 --- a/web/static/robots.txt +++ b/web/static/robots.txt @@ -1,9 +1,11 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma +User-agent: aiHitBot User-agent: Amazonbot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended +User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User @@ -11,9 +13,13 @@ User-agent: Claude-Web User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler +User-agent: Cotoyogi +User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot User-agent: FacebookBot +User-agent: Factset_spyderbot +User-agent: FirecrawlAgent User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther @@ -24,19 +30,27 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset +User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot +User-agent: meta-externalagent User-agent: Meta-ExternalAgent +User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher +User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: Operator User-agent: PanguBot +User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy -User-agent: SemrushBot +User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot +User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended @@ -44,4 +58,4 @@ User-agent: YouBot Disallow: / User-agent: * -Disallow: / \ No newline at end of file +Disallow: /