From 7c0996448a3b3f544e7c5f277a4d5641b61cd561 Mon Sep 17 00:00:00 2001
From: Xe Iaso <xe.iaso@techaro.lol>
Date: Fri, 4 Jul 2025 00:10:45 +0000
Subject: [PATCH] chore(default-config): allowlist common crawl (#753)

This may seem strange, but allowlisting Common Crawl means that scrapers
have less incentive to scrape because they can just grab the data from
Common Crawl instead of scraping it again.
---
 data/bots/ai-catchall.yaml     |  2 +-
 data/bots/ai-robots-txt.yaml   |  4 +++-
 data/crawlers/_allow-good.yaml |  3 ++-
 data/crawlers/commoncrawl.yaml | 12 ++++++++++++
 docs/docs/CHANGELOG.md         |  1 +
 5 files changed, 19 insertions(+), 3 deletions(-)
 create mode 100644 data/crawlers/commoncrawl.yaml

diff --git a/data/bots/ai-catchall.yaml b/data/bots/ai-catchall.yaml
index 05dc6e1..40dc742 100644
--- a/data/bots/ai-catchall.yaml
+++ b/data/bots/ai-catchall.yaml
@@ -7,5 +7,5 @@
 # Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
 - name: "ai-catchall"
   user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
   action: DENY
diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml
index 3920cf9..c330eb7 100644
--- a/data/bots/ai-robots-txt.yaml
+++ b/data/bots/ai-robots-txt.yaml
@@ -1,6 +1,8 @@
 # Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
 # Note: Blocks human-directed/non-training user agents
+#
+# CCBot is intentionally not listed here: if Common Crawl can crawl the site, scrapers can get the data from Common Crawl instead of scraping it again.
 - name: "ai-robots-txt"
   user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot
   action: DENY
diff --git a/data/crawlers/_allow-good.yaml b/data/crawlers/_allow-good.yaml
index f3784cb..6ccf0ce 100644
--- a/data/crawlers/_allow-good.yaml
+++ b/data/crawlers/_allow-good.yaml
@@ -6,4 +6,5 @@
 - import: (data)/crawlers/internet-archive.yaml
 - import: (data)/crawlers/kagibot.yaml
 - import: (data)/crawlers/marginalia.yaml
-- import: (data)/crawlers/mojeekbot.yaml
\ No newline at end of file
+- import: (data)/crawlers/mojeekbot.yaml
+- import: (data)/crawlers/commoncrawl.yaml
diff --git a/data/crawlers/commoncrawl.yaml b/data/crawlers/commoncrawl.yaml
new file mode 100644
index 0000000..457a5af
--- /dev/null
+++ b/data/crawlers/commoncrawl.yaml
@@ -0,0 +1,12 @@
+- name: common-crawl
+  user_agent_regex: CCBot
+  action: ALLOW
+  # Address ranges for CCBot; source: https://index.commoncrawl.org/ccbot.json
+  remote_addresses:
+    [
+      "2600:1f28:365:80b0::/60",
+      "18.97.9.168/29",
+      "18.97.14.80/29",
+      "18.97.14.88/30",
+      "98.85.178.216/32",
+    ]
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index fdd553f..0ff2804 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add translation for German language ([#741](https://github.com/TecharoHQ/anubis/pull/741))
 - Remove the "Success" interstitial after a proof of work challenge is concluded.
 - Add option for forcing a specific language ([#742](https://github.com/TecharoHQ/anubis/pull/742))
+- Allow [Common Crawl](https://commoncrawl.org/) by default so scrapers have less incentive to scrape ([#753](https://github.com/TecharoHQ/anubis/pull/753))
 
 ### Potentially breaking changes