## anubis/data/botPolicies.yaml
## Anubis lets you import snippets of configuration into the main configuration
## file. This allows you to break up your config into smaller parts that get
## logically assembled into one big file.
##
## Of note, a bot rule can either have inline bot configuration or import a
## bot config snippet. You cannot do both in a single bot rule.
##
## Import paths can either be prefixed with (data) to import from the common/shared
## rules in the data folder of the Anubis source tree, or they can point to
## absolute/relative paths on your filesystem. If you don't have access to the
## Anubis source tree, check /usr/share/docs/anubis/data or the tarball you
## extracted Anubis from.
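##
## As an illustrative sketch only (the rule name, user-agent, and filesystem
## path below are hypothetical), an inline rule and imported rules would look
## like this:
##
##   bots:
##     # Inline rule: all of the bot configuration is declared right here.
##     - name: example-inline-rule
##       user_agent_regex: ExampleScraper
##       action: DENY
##     # Imported rule: the configuration lives in a shared snippet...
##     - import: (data)/bots/ai-robots-txt.yaml
##     # ...or in a file on your filesystem.
##     - import: /etc/anubis/snippets/local-rules.yaml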
bots:
  # Pathological bots to deny
  - # This correlates to data/bots/deny-pathological.yaml in the source tree
    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
    import: (data)/bots/_deny-pathological.yaml
  - import: (data)/bots/aggressive-brazilian-scrapers.yaml
  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
  - import: (data)/bots/ai-robots-txt.yaml
  # Search engine crawlers to allow, defaults to:
  #   - Google (so they don't try to bypass Anubis)
  #   - Bing
  #   - DuckDuckGo
  #   - Qwant
  #   - The Internet Archive
  #   - Kagi
  #   - Marginalia
  #   - Mojeek
  - import: (data)/crawlers/_allow-good.yaml
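  # If you need to allow an extra crawler that the shared list above does not
  # cover, an inline rule along these lines should work (the name and
  # user-agent here are hypothetical):
  # - name: example-extra-crawler
  #   user_agent_regex: ExampleSearchBot
  #   action: ALLOW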
  # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
  - import: (data)/common/keep-internet-working.yaml
  # # Punish any bot with "bot" in the user-agent string
  # # This is known to have a high false-positive rate, use at your own risk
  # - name: generic-bot-catchall
  #   user_agent_regex: (?i:bot|crawler)
  #   action: CHALLENGE
  #   challenge:
  #     difficulty: 16 # impossible
  #     report_as: 4 # lie to the operator
  #     algorithm: slow # intentionally waste CPU cycles and time
  # Generic catchall rule
  - name: generic-browser
    user_agent_regex: >-
      Mozilla|Opera
    action: CHALLENGE

dnsbl: false
# By default, send HTTP 200 back to clients that either get issued a challenge
# or a denial. This seems weird, but this is load-bearing due to the fact that
# the most aggressive scraper bots seem to really, really want an HTTP 200 and
# will stop sending requests once they get it.
status_codes:
  CHALLENGE: 200
  DENY: 200
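# If you would rather give clients conventional error semantics (for example,
# so monitoring can tell challenges and denials apart), these mappings can
# presumably be pointed at other HTTP status codes instead; a sketch:
# status_codes:
#   CHALLENGE: 401
#   DENY: 403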