chore(default-config): allowlist common crawl (#753)
This may seem strange, but allowlisting Common Crawl means scrapers have less incentive to scrape the site themselves: they can grab the data from Common Crawl instead of re-scraping it.
parent d7a758f805
commit 7c0996448a
@@ -7,5 +7,5 @@
 # Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
 - name: "ai-catchall"
   user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
   action: DENY
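The only change in this hunk is dropping `CCBot|` from the DENY pattern. As a minimal Go sketch (abbreviated pattern, not Anubis's actual rule engine) of how an unanchored user-agent regex like this behaves before and after the change:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Abbreviated stand-ins for the real ~50-entry alternation.
	before := regexp.MustCompile(`Bytespider|CCBot|Claude-Web`)
	after := regexp.MustCompile(`Bytespider|Claude-Web`)

	ua := "CCBot/2.0 (https://commoncrawl.org/faq/)"
	fmt.Println(before.MatchString(ua)) // true  -> previously matched the DENY rule
	fmt.Println(after.MatchString(ua))  // false -> now falls through to later rules
}
```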
@@ -1,6 +1,8 @@
 # Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
 # Note: Blocks human-directed/non-training user agents
+#
+# CCBot is allowed because if Common Crawl is allowed, then scrapers don't need to scrape to get the data.
 - name: "ai-robots-txt"
   user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot
   action: DENY
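The warning at the top of this file matters because Anubis only sees requests it proxies; crawlers that honor robots.txt should also be opted out there. A hypothetical robots.txt excerpt under that assumption (the real ai.robots.txt list is far longer), blocking training crawlers while leaving CCBot free to crawl:

```
User-agent: GPTBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: CCBot
Disallow:
```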
@@ -6,4 +6,5 @@
 - import: (data)/crawlers/internet-archive.yaml
 - import: (data)/crawlers/kagibot.yaml
 - import: (data)/crawlers/marginalia.yaml
 - import: (data)/crawlers/mojeekbot.yaml
+- import: (data)/crawlers/commoncrawl.yaml
data/crawlers/commoncrawl.yaml (new file, +12)
@@ -0,0 +1,12 @@
+- name: common-crawl
+  user_agent_regex: CCBot
+  action: ALLOW
+  # https://index.commoncrawl.org/ccbot.json
+  remote_addresses:
+    [
+      "2600:1f28:365:80b0::/60",
+      "18.97.9.168/29",
+      "18.97.14.80/29",
+      "18.97.14.88/30",
+      "98.85.178.216/32",
+    ]
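Because the rule pairs user_agent_regex with remote_addresses, the ALLOW only applies when both the user agent and the source address match, so spoofing the UA string alone is not enough. A minimal Go sketch of that double check (illustrative names, not Anubis's actual matcher):

```go
package main

import (
	"fmt"
	"net/netip"
	"regexp"
)

// ccbotPrefixes mirrors the remote_addresses list above, which Common
// Crawl publishes at https://index.commoncrawl.org/ccbot.json.
var ccbotPrefixes = []netip.Prefix{
	netip.MustParsePrefix("2600:1f28:365:80b0::/60"),
	netip.MustParsePrefix("18.97.9.168/29"),
	netip.MustParsePrefix("18.97.14.80/29"),
	netip.MustParsePrefix("18.97.14.88/30"),
	netip.MustParsePrefix("98.85.178.216/32"),
}

var ccbotUA = regexp.MustCompile(`CCBot`)

// isCommonCrawl reports whether a request both claims the CCBot user
// agent and originates from a published Common Crawl address range.
func isCommonCrawl(userAgent, remoteAddr string) bool {
	if !ccbotUA.MatchString(userAgent) {
		return false
	}
	ip, err := netip.ParseAddr(remoteAddr)
	if err != nil {
		return false
	}
	for _, p := range ccbotPrefixes {
		if p.Contains(ip) {
			return true
		}
	}
	return false
}

func main() {
	ua := "CCBot/2.0 (https://commoncrawl.org/faq/)"
	fmt.Println(isCommonCrawl(ua, "18.97.9.170")) // true: inside 18.97.9.168/29
	fmt.Println(isCommonCrawl(ua, "203.0.113.7")) // false: right UA, wrong network
}
```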
@@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add translation for German language ([#741](https://github.com/TecharoHQ/anubis/pull/741))
 - Remove the "Success" interstitial after a proof of work challenge is concluded.
 - Add option for forcing a specific language ([#742](https://github.com/TecharoHQ/anubis/pull/742))
+- Allow [Common Crawl](https://commoncrawl.org/) by default so scrapers have less incentive to scrape

 ### Potentially breaking changes
