Mirror of https://github.com/TecharoHQ/anubis.git, synced 2025-08-03 01:38:14 -04:00
feat(config): allow multi-level imports (#402)
* feat(config): allow multi-level imports

  Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(data): fix spelling of Marginalia

  Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
parent
f8e1000ab0
commit
6e82373718
@@ -1,40 +1,13 @@
{
  "bots": [
    {
      "import": "(data)/bots/_deny-pathological.yaml"
    },
    {
      "import": "(data)/bots/ai-robots-txt.yaml"
    },
    {
      "import": "(data)/bots/cloudflare-workers.yaml"
    },
    {
      "import": "(data)/bots/headless-browsers.yaml"
    },
    {
      "import": "(data)/bots/us-ai-scraper.yaml"
    },
    {
      "import": "(data)/crawlers/googlebot.yaml"
    },
    {
      "import": "(data)/crawlers/bingbot.yaml"
    },
    {
      "import": "(data)/crawlers/duckduckbot.yaml"
    },
    {
      "import": "(data)/crawlers/qwantbot.yaml"
    },
    {
      "import": "(data)/crawlers/internet-archive.yaml"
    },
    {
      "import": "(data)/crawlers/kagibot.yaml"
    },
    {
      "import": "(data)/crawlers/marginalia.yaml"
    },
    {
      "import": "(data)/crawlers/mojeekbot.yaml"
      "import": "(data)/crawlers/_allow-good.yaml"
    },
    {
      "import": "(data)/common/keep-internet-working.yaml"
@@ -45,5 +18,9 @@
      "action": "CHALLENGE"
    }
  ],
  "dnsbl": false
}
  "dnsbl": false,
  "status_codes": {
    "CHALLENGE": 200,
    "DENY": 200
  }
}
@@ -12,21 +12,23 @@

bots:
  # Pathological bots to deny
  - # This correlates to data/bots/ai-robots-txt.yaml in the source tree
    import: (data)/bots/ai-robots-txt.yaml
  - import: (data)/bots/cloudflare-workers.yaml
  - import: (data)/bots/headless-browsers.yaml
  - import: (data)/bots/us-ai-scraper.yaml
  - # This correlates to data/bots/deny-pathological.yaml in the source tree
    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
    import: (data)/bots/_deny-pathological.yaml

  # Search engines to allow
  - import: (data)/crawlers/googlebot.yaml
  - import: (data)/crawlers/bingbot.yaml
  - import: (data)/crawlers/duckduckbot.yaml
  - import: (data)/crawlers/qwantbot.yaml
  - import: (data)/crawlers/internet-archive.yaml
  - import: (data)/crawlers/kagibot.yaml
  - import: (data)/crawlers/marginalia.yaml
  - import: (data)/crawlers/mojeekbot.yaml
  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
  - import: (data)/bots/ai-robots-txt.yaml

  # Search engine crawlers to allow, defaults to:
  # - Google (so they don't try to bypass Anubis)
  # - Bing
  # - DuckDuckGo
  # - Qwant
  # - The Internet Archive
  # - Kagi
  # - Marginalia
  # - Mojeek
  - import: (data)/crawlers/_allow-good.yaml

  # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
  - import: (data)/common/keep-internet-working.yaml
data/bots/_deny-pathological.yaml (new file, 3 lines)
@@ -0,0 +1,3 @@
- import: (data)/bots/cloudflare-workers.yaml
- import: (data)/bots/headless-browsers.yaml
- import: (data)/bots/us-ai-scraper.yaml
data/crawlers/_allow-good.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
- import: (data)/crawlers/googlebot.yaml
- import: (data)/crawlers/bingbot.yaml
- import: (data)/crawlers/duckduckbot.yaml
- import: (data)/crawlers/qwantbot.yaml
- import: (data)/crawlers/internet-archive.yaml
- import: (data)/crawlers/kagibot.yaml
- import: (data)/crawlers/marginalia.yaml
- import: (data)/crawlers/mojeekbot.yaml
@@ -3,6 +3,6 @@ package data

import "embed"

var (
	//go:embed botPolicies.yaml botPolicies.json apps bots common crawlers
	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:common all:crawlers
	BotPolicies embed.FS
)
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added customization of authorization cookie expiration time with `--cookie-expiration-time` flag or envvar
- Updated the `OG_PASSTHROUGH` to be true by default, thereby allowing OpenGraph tags to be passed through by default
- Added the ability to [customize Anubis' HTTP status codes](./admin/configuration/custom-status-codes.mdx) ([#355](https://github.com/TecharoHQ/anubis/issues/355))
- Change import syntax to allow multi-level imports

## v1.17.0: Asahi sas Brutus
@@ -79,6 +79,45 @@ config.BotOrImport: rule definition is invalid, you must set either bot rules or

Paths can either be prefixed with `(data)` to import from [the data folder in the Anubis source tree](https://github.com/TecharoHQ/anubis/tree/main/data), or they can point anywhere on the filesystem. If you don't have access to the Anubis source tree, check /usr/share/docs/anubis/data or the tarball you extracted Anubis from.
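For example, a minimal sketch mixing both styles; the filesystem path below is hypothetical:

```yaml
bots:
  # snippet bundled with the Anubis data folder
  - import: (data)/bots/_deny-pathological.yaml
  # snippet loaded from an arbitrary path on disk (hypothetical path)
  - import: /etc/anubis/local-rules.yaml
```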

## Importing from imports

You can also import from a file that is itself imported, for example to pull in an entire folder of rules at once.

<Tabs>
<TabItem value="json" label="JSON">

```json
{
  "bots": [
    {
      "import": "(data)/bots/_deny-pathological.yaml"
    }
  ]
}
```

</TabItem>
<TabItem value="yaml" label="YAML" default>

```yaml
bots:
  - import: (data)/bots/_deny-pathological.yaml
```

</TabItem>
</Tabs>

This lets you import an entire ruleset at once:

```yaml
# (data)/bots/_deny-pathological.yaml
- import: (data)/bots/cloudflare-workers.yaml
- import: (data)/bots/headless-browsers.yaml
- import: (data)/bots/us-ai-scraper.yaml
```
Use this with care: it is easy to get into a state where Anubis recursively imports things forever. The best way to use this is to make a "root import" named something like `_everything.yaml` or `_allow-good.yaml` so it sorts to the top of the folder. Name your meta-imports after the main verb they enforce so that you can glance at the configuration file and understand what it does.
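As a sketch, such a root import could look like the hypothetical `_everything.yaml` below; the file name is illustrative, while the imported snippets are the ones shipped in the data folder:

```yaml
# _everything.yaml (hypothetical root import)
- import: (data)/bots/_deny-pathological.yaml
- import: (data)/crawlers/_allow-good.yaml
- import: (data)/common/keep-internet-working.yaml
```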
## Writing snippets

Snippets can be written in either JSON or YAML, with a preference for YAML. When writing a snippet, write the bot rules you want directly at the top level of the file as a list, as in the sketch below.
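A minimal snippet sketch; it assumes the usual Anubis bot rule fields (`name`, `user_agent_regex`, `action`), and the bot names and regexes here are made up for illustration:

```yaml
# hypothetical snippet: deny-example-scrapers.yaml
- name: example-scraper
  user_agent_regex: ExampleScraper
  action: DENY
- name: example-suspicious-bot
  user_agent_regex: ExampleBot
  action: CHALLENGE
```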
@@ -216,18 +216,27 @@ func (is *ImportStatement) load() error {
	}
	defer fin.Close()

	var imported []BotOrImport
	var result []BotConfig

	if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&result); err != nil {
	if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&imported); err != nil {
		return fmt.Errorf("can't parse %s: %w", is.Import, err)
	}

	var errs []error

	for _, b := range result {
	for _, b := range imported {
		if err := b.Valid(); err != nil {
			errs = append(errs, err)
		}

		if b.ImportStatement != nil {
			result = append(result, b.ImportStatement.Bots...)
		}

		if b.BotConfig != nil {
			result = append(result, *b.BotConfig)
		}
	}

	if len(errs) != 0 {