diff --git a/data/bots/ai-catchall.yaml b/data/bots/ai-catchall.yaml index 05dc6e1..40dc742 100644 --- a/data/bots/ai-catchall.yaml +++ b/data/bots/ai-catchall.yaml @@ -7,5 +7,5 @@ # Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. - name: "ai-catchall" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot action: DENY diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml index 3920cf9..c330eb7 100644 --- a/data/bots/ai-robots-txt.yaml +++ b/data/bots/ai-robots-txt.yaml @@ -1,6 +1,8 @@ # Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. # Note: Blocks human-directed/non-training user agents +# +# CCBot is allowed because if Common Crawl is allowed, then scrapers don't need to scrape to get the data. - name: "ai-robots-txt" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot action: DENY diff --git a/data/crawlers/_allow-good.yaml b/data/crawlers/_allow-good.yaml index f3784cb..6ccf0ce 100644 --- a/data/crawlers/_allow-good.yaml +++ b/data/crawlers/_allow-good.yaml @@ -6,4 +6,5 @@ - import: (data)/crawlers/internet-archive.yaml - import: (data)/crawlers/kagibot.yaml - import: (data)/crawlers/marginalia.yaml -- import: (data)/crawlers/mojeekbot.yaml \ No newline at end of file +- import: (data)/crawlers/mojeekbot.yaml +- import: (data)/crawlers/commoncrawl.yaml diff --git a/data/crawlers/commoncrawl.yaml b/data/crawlers/commoncrawl.yaml new file mode 100644 index 0000000..457a5af --- /dev/null +++ b/data/crawlers/commoncrawl.yaml @@ -0,0 +1,12 @@ +- name: common-crawl + user_agent_regex: CCBot + action: ALLOW + # https://index.commoncrawl.org/ccbot.json + remote_addresses: + [ + "2600:1f28:365:80b0::/60", + "18.97.9.168/29", + "18.97.14.80/29", + "18.97.14.88/30", + "98.85.178.216/32", + ] diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index fdd553f..0ff2804 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add translation for German language ([#741](https://github.com/TecharoHQ/anubis/pull/741)) - Remove the "Success" interstitial after a proof of work challenge is concluded. - Add option for forcing a specific language ([#742](https://github.com/TecharoHQ/anubis/pull/742)) +- Allow [Common Crawl](https://commoncrawl.org/) by default so scrapers have less incentive to scrape ### Potentially breaking changes