Mirror of https://github.com/TecharoHQ/anubis.git, synced 2025-08-03 01:38:14 -04:00
feat(config): allow multi-level imports (#402)
* feat(config): allow multi-level imports

  Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(data): fix spelling of Marginalia

  Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
parent
f8e1000ab0
commit
6e82373718
@@ -1,40 +1,13 @@
{
  "bots": [
    {
      "import": "(data)/bots/_deny-pathological.yaml"
    },
    {
      "import": "(data)/bots/ai-robots-txt.yaml"
    },
    {
      "import": "(data)/bots/cloudflare-workers.yaml"
    },
    {
      "import": "(data)/bots/headless-browsers.yaml"
    },
    {
      "import": "(data)/bots/us-ai-scraper.yaml"
    },
    {
      "import": "(data)/crawlers/googlebot.yaml"
    },
    {
      "import": "(data)/crawlers/bingbot.yaml"
    },
    {
      "import": "(data)/crawlers/duckduckbot.yaml"
    },
    {
      "import": "(data)/crawlers/qwantbot.yaml"
    },
    {
      "import": "(data)/crawlers/internet-archive.yaml"
    },
    {
      "import": "(data)/crawlers/kagibot.yaml"
    },
    {
      "import": "(data)/crawlers/marginalia.yaml"
    },
    {
      "import": "(data)/crawlers/mojeekbot.yaml"
      "import": "(data)/crawlers/_allow-good.yaml"
    },
    {
      "import": "(data)/common/keep-internet-working.yaml"
@@ -45,5 +18,9 @@
      "action": "CHALLENGE"
    }
  ],
  "dnsbl": false
}
  "dnsbl": false,
  "status_codes": {
    "CHALLENGE": 200,
    "DENY": 200
  }
}
@@ -12,21 +12,23 @@

bots:
  # Pathological bots to deny
  - # This correlates to data/bots/ai-robots-txt.yaml in the source tree
    import: (data)/bots/ai-robots-txt.yaml
  - import: (data)/bots/cloudflare-workers.yaml
  - import: (data)/bots/headless-browsers.yaml
  - import: (data)/bots/us-ai-scraper.yaml
  - # This correlates to data/bots/deny-pathological.yaml in the source tree
    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
    import: (data)/bots/_deny-pathological.yaml

  # Search engines to allow
  - import: (data)/crawlers/googlebot.yaml
  - import: (data)/crawlers/bingbot.yaml
  - import: (data)/crawlers/duckduckbot.yaml
  - import: (data)/crawlers/qwantbot.yaml
  - import: (data)/crawlers/internet-archive.yaml
  - import: (data)/crawlers/kagibot.yaml
  - import: (data)/crawlers/marginalia.yaml
  - import: (data)/crawlers/mojeekbot.yaml
  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
  - import: (data)/bots/ai-robots-txt.yaml

  # Search engine crawlers to allow, defaults to:
  # - Google (so they don't try to bypass Anubis)
  # - Bing
  # - DuckDuckGo
  # - Qwant
  # - The Internet Archive
  # - Kagi
  # - Marginalia
  # - Mojeek
  - import: (data)/crawlers/_allow-good.yaml

  # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
  - import: (data)/common/keep-internet-working.yaml
data/bots/_deny-pathological.yaml (new file, 3 lines)
@@ -0,0 +1,3 @@
- import: (data)/bots/cloudflare-workers.yaml
- import: (data)/bots/headless-browsers.yaml
- import: (data)/bots/us-ai-scraper.yaml
data/crawlers/_allow-good.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
- import: (data)/crawlers/googlebot.yaml
- import: (data)/crawlers/bingbot.yaml
- import: (data)/crawlers/duckduckbot.yaml
- import: (data)/crawlers/qwantbot.yaml
- import: (data)/crawlers/internet-archive.yaml
- import: (data)/crawlers/kagibot.yaml
- import: (data)/crawlers/marginalia.yaml
- import: (data)/crawlers/mojeekbot.yaml
@@ -3,6 +3,6 @@ package data

import "embed"

var (
	//go:embed botPolicies.yaml botPolicies.json apps bots common crawlers
	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:common all:crawlers
	BotPolicies embed.FS
)
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added customization of authorization cookie expiration time with `--cookie-expiration-time` flag or envvar
- Updated the `OG_PASSTHROUGH` to be true by default, thereby allowing OpenGraph tags to be passed through by default
- Added the ability to [customize Anubis' HTTP status codes](./admin/configuration/custom-status-codes.mdx) ([#355](https://github.com/TecharoHQ/anubis/issues/355))
- Change import syntax to allow multi-level imports

## v1.17.0: Asahi sas Brutus
@@ -79,6 +79,45 @@ config.BotOrImport: rule definition is invalid, you must set either bot rules or

Paths can either be prefixed with `(data)` to import from [the data folder in the Anubis source tree](https://github.com/TecharoHQ/anubis/tree/main/data), or they can point anywhere on the filesystem. If you don't have access to the Anubis source tree, check /usr/share/docs/anubis/data or the tarball you extracted Anubis from.
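For example, a minimal sketch mixing both styles; the filesystem path below is hypothetical:

```yaml
bots:
  # snippet bundled with the Anubis data folder
  - import: (data)/bots/_deny-pathological.yaml
  # snippet loaded from an arbitrary path on disk (hypothetical path)
  - import: /etc/anubis/local-rules.yaml
```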

## Importing from imports

You can also import from a file that is itself imported, for example to pull in an entire folder of rules at once.

<Tabs>
<TabItem value="json" label="JSON">

```json
{
  "bots": [
    {
      "import": "(data)/bots/_deny-pathological.yaml"
    }
  ]
}
```

</TabItem>
<TabItem value="yaml" label="YAML" default>

```yaml
bots:
  - import: (data)/bots/_deny-pathological.yaml
```

</TabItem>
</Tabs>

This lets you import an entire ruleset at once:

```yaml
# (data)/bots/_deny-pathological.yaml
- import: (data)/bots/cloudflare-workers.yaml
- import: (data)/bots/headless-browsers.yaml
- import: (data)/bots/us-ai-scraper.yaml
```
Use this with care: it is easy to get into a state where Anubis recursively imports things forever. The best way to use this is to make a "root import" named something like `_everything.yaml` or `_allow-good.yaml` so it sorts to the top of the folder. Name your meta-imports after the main verb they enforce so that you can glance at the configuration file and understand what it does.
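As a sketch, such a root import could look like the hypothetical `_everything.yaml` below; the file name is illustrative, while the imported snippets are the ones shipped in the data folder:

```yaml
# _everything.yaml (hypothetical root import)
- import: (data)/bots/_deny-pathological.yaml
- import: (data)/crawlers/_allow-good.yaml
- import: (data)/common/keep-internet-working.yaml
```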
## Writing snippets

Snippets can be written in either JSON or YAML, with a preference for YAML. When writing a snippet, write the bot rules you want directly at the top level of the file as a list, as in the sketch below.
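A minimal snippet sketch; it assumes the usual Anubis bot rule fields (`name`, `user_agent_regex`, `action`), and the bot names and regexes here are made up for illustration:

```yaml
# hypothetical snippet: deny-example-scrapers.yaml
- name: example-scraper
  user_agent_regex: ExampleScraper
  action: DENY
- name: example-suspicious-bot
  user_agent_regex: ExampleBot
  action: CHALLENGE
```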
@@ -216,18 +216,27 @@ func (is *ImportStatement) load() error {
	}
	defer fin.Close()

	var imported []BotOrImport
	var result []BotConfig

	if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&result); err != nil {
	if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&imported); err != nil {
		return fmt.Errorf("can't parse %s: %w", is.Import, err)
	}

	var errs []error

	for _, b := range result {
	for _, b := range imported {
		if err := b.Valid(); err != nil {
			errs = append(errs, err)
		}

		if b.ImportStatement != nil {
			result = append(result, b.ImportStatement.Bots...)
		}

		if b.BotConfig != nil {
			result = append(result, *b.BotConfig)
		}
	}

	if len(errs) != 0 {