From 7c0996448a3b3f544e7c5f277a4d5641b61cd561 Mon Sep 17 00:00:00 2001 From: Xe Iaso Date: Fri, 4 Jul 2025 00:10:45 +0000 Subject: [PATCH] chore(default-config): allowlist common crawl (#753) This may seem strange, but allowlisting common crawl means that scrapers have less incentive to scrape because they can just grab the data from common crawl instead of scraping it again. --- data/bots/ai-catchall.yaml | 2 +- data/bots/ai-robots-txt.yaml | 4 +++- data/crawlers/_allow-good.yaml | 3 ++- data/crawlers/commoncrawl.yaml | 12 ++++++++++++ docs/docs/CHANGELOG.md | 1 + 5 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 data/crawlers/commoncrawl.yaml diff --git a/data/bots/ai-catchall.yaml b/data/bots/ai-catchall.yaml index 05dc6e1..40dc742 100644 --- a/data/bots/ai-catchall.yaml +++ b/data/bots/ai-catchall.yaml @@ -7,5 +7,5 @@ # Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. - name: "ai-catchall" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot action: DENY diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml index 3920cf9..c330eb7 100644 --- a/data/bots/ai-robots-txt.yaml +++ b/data/bots/ai-robots-txt.yaml @@ -1,6 +1,8 @@ # Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect. # Note: Blocks human-directed/non-training user agents +# +# CCBot is allowed because if Common Crawl is allowed, then scrapers don't need to scrape to get the data. - name: "ai-robots-txt" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot + AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot action: DENY diff --git a/data/crawlers/_allow-good.yaml b/data/crawlers/_allow-good.yaml index f3784cb..6ccf0ce 100644 --- a/data/crawlers/_allow-good.yaml +++ b/data/crawlers/_allow-good.yaml @@ -6,4 +6,5 @@ - import: (data)/crawlers/internet-archive.yaml - import: (data)/crawlers/kagibot.yaml - import: (data)/crawlers/marginalia.yaml -- import: (data)/crawlers/mojeekbot.yaml \ No newline at end of file +- import: (data)/crawlers/mojeekbot.yaml +- import: (data)/crawlers/commoncrawl.yaml diff --git a/data/crawlers/commoncrawl.yaml b/data/crawlers/commoncrawl.yaml new file mode 100644 index 0000000..457a5af --- /dev/null +++ b/data/crawlers/commoncrawl.yaml @@ -0,0 +1,12 @@ +- name: common-crawl + user_agent_regex: CCBot + action: ALLOW + # https://index.commoncrawl.org/ccbot.json + remote_addresses: + [ + "2600:1f28:365:80b0::/60", + "18.97.9.168/29", + "18.97.14.80/29", + "18.97.14.88/30", + "98.85.178.216/32", + ] diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index fdd553f..0ff2804 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add translation for German language ([#741](https://github.com/TecharoHQ/anubis/pull/741)) - Remove the "Success" interstitial after a proof of work challenge is concluded. - Add option for forcing a specific language ([#742](https://github.com/TecharoHQ/anubis/pull/742)) +- Allow [Common Crawl](https://commoncrawl.org/) by default so scrapers have less incentive to scrape ### Potentially breaking changes