diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index ab3b214..3f089dd 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -6,6 +6,7 @@ amazonbot
 anthro
 anubis
 anubistest
+Applebot
 archlinux
 badregexes
 berr
@@ -113,6 +114,7 @@ keypair
 KHTML
 kinda
 KUBECONFIG
+lcj
 ldflags
 letsencrypt
 Lexentale
diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml
index e054fbc..917c417 100644
--- a/data/botPolicies.yaml
+++ b/data/botPolicies.yaml
@@ -22,6 +22,7 @@ bots:
   # Search engine crawlers to allow, defaults to:
   # - Google (so they don't try to bypass Anubis)
+  # - Apple
   # - Bing
   # - DuckDuckGo
   # - Qwant
diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml
index 5b3b20b..e515201 100644
--- a/data/bots/ai-robots-txt.yaml
+++ b/data/bots/ai-robots-txt.yaml
@@ -1,4 +1,6 @@
+# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
+# Note: Blocks human-directed/non-training user agents
 - name: "ai-robots-txt"
   user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
   action: DENY
diff --git a/data/crawlers/_allow-good.yaml b/data/crawlers/_allow-good.yaml
index f95e176..f3784cb 100644
--- a/data/crawlers/_allow-good.yaml
+++ b/data/crawlers/_allow-good.yaml
@@ -1,4 +1,5 @@
 - import: (data)/crawlers/googlebot.yaml
+- import: (data)/crawlers/applebot.yaml
 - import: (data)/crawlers/bingbot.yaml
 - import: (data)/crawlers/duckduckbot.yaml
 - import: (data)/crawlers/qwantbot.yaml
diff --git a/data/crawlers/applebot.yaml b/data/crawlers/applebot.yaml
new file mode 100644
index 0000000..e75dfe1
--- /dev/null
+++ b/data/crawlers/applebot.yaml
@@ -0,0 +1,20 @@
+# Indexing for search and Siri
+# https://support.apple.com/en-us/119829
+- name: applebot
+  user_agent_regex: Applebot
+  action: ALLOW
+  # https://search.developer.apple.com/applebot.json
+  remote_addresses: [
+    "17.241.208.160/27",
+    "17.241.193.160/27",
+    "17.241.200.160/27",
+    "17.22.237.0/24",
+    "17.22.245.0/24",
+    "17.22.253.0/24",
+    "17.241.75.0/24",
+    "17.241.219.0/24",
+    "17.241.227.0/24",
+    "17.246.15.0/24",
+    "17.246.19.0/24",
+    "17.246.23.0/24",
+  ]
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index 2ccf61a..45a18b3 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added OpenRC init.d script.
 - Added `--version` flag.
 - Added `anubis_proxied_requests_total` metric to count proxied requests.
+- Add `Applebot` as "good" web crawler
 
 ## v1.18.0: Varis zos Galvus