Add Applebot definition (#589)

* Add Applebot definition Adds Apple's search indexing bot, and allowlists it by default. Allowlisted by default because it is equivalent to Googlebot/Bingbot. Remove Applebot from `ai-robots-txt.yaml` for the same reasons. Remove `Applebot-Extended` from `ai-robots-txt.yaml` as it has no effect. * chore: spelling Signed-off-by: Xe Iaso <me@xeiaso.net> --------- Signed-off-by: Xe Iaso <me@xeiaso.net> Co-authored-by: Xe Iaso <me@xeiaso.net>
2025-09-22 03:12:47 -04:00 · 2025-05-31 07:18:32 -07:00 · 2025-05-31 07:18:32 -07:00 · 68a71c6a99
commit 68a71c6a99
parent fbbab5a035
6 changed files with 28 additions and 1 deletions
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@ -6,6 +6,7 @@ amazonbot
 anthro
 anubis
 anubistest
+Applebot
 archlinux
 badregexes
 berr
@ -113,6 +114,7 @@ keypair
 KHTML
 kinda
 KUBECONFIG
+lcj
 ldflags
 letsencrypt
 Lexentale
--- a/data/botPolicies.yaml
+++ b/data/botPolicies.yaml
@ -22,6 +22,7 @@ bots:

  # Search engine crawlers to allow, defaults to:
  #   - Google (so they don't try to bypass Anubis)
+  #   - Apple
  #   - Bing
  #   - DuckDuckGo
  #   - Qwant
--- a/data/bots/ai-robots-txt.yaml
+++ b/data/bots/ai-robots-txt.yaml
@ -1,4 +1,6 @@
+# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
+# Note: Blocks human-directed/non-training user agents
 - name: "ai-robots-txt"
  user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
  action: DENY
--- a/data/crawlers/_allow-good.yaml
+++ b/data/crawlers/_allow-good.yaml
@ -1,4 +1,5 @@
 - import: (data)/crawlers/googlebot.yaml
+- import: (data)/crawlers/applebot.yaml
 - import: (data)/crawlers/bingbot.yaml
 - import: (data)/crawlers/duckduckbot.yaml
 - import: (data)/crawlers/qwantbot.yaml
--- a/data/crawlers/applebot.yaml
+++ b/data/crawlers/applebot.yaml
@ -0,0 +1,20 @@
+# Apple's crawler: indexes pages for search and Siri.
+# Reference: https://support.apple.com/en-us/119829
+- name: applebot
+  user_agent_regex: Applebot
+  action: ALLOW
+  # Address ranges; see the list published at
+  # https://search.developer.apple.com/applebot.json
+  remote_addresses:
+    - "17.241.208.160/27"
+    - "17.241.193.160/27"
+    - "17.241.200.160/27"
+    - "17.22.237.0/24"
+    - "17.22.245.0/24"
+    - "17.22.253.0/24"
+    - "17.241.75.0/24"
+    - "17.241.219.0/24"
+    - "17.241.227.0/24"
+    - "17.246.15.0/24"
+    - "17.246.19.0/24"
+    - "17.246.23.0/24"
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added OpenRC init.d script.
 - Added `--version` flag.
 - Added `anubis_proxied_requests_total` metric to count proxied requests.
+- Added `Applebot` as a "good" web crawler.

 ## v1.18.0: Varis zos Galvus