From 22c47f40d1e46688c83a9f459b419cf9524100b0 Mon Sep 17 00:00:00 2001 From: Xe Iaso Date: Wed, 28 May 2025 16:36:27 -0400 Subject: [PATCH] feat(expressions): add randInt function to allow making rules nondeterministic (#578) This seems counter-intuitive at first glance, but let me cook. One of the problems with Anubis is that the rule matching is super deterministic. This means that attackers can figure out what patterns they are hitting and change things to bypass them. The randInt function lets you have rulesets behave nondeterministically. This is a very easy way to hang yourself, but can be great to psychologically mess with scraper operators. Consider this rule: ```yaml - name: deny-lightpanda-sometimes action: DENY expression: all: - userAgent.matches("LightPanda") - randInt(16) >= 4 ``` It would match about 75% of the time. Signed-off-by: Xe Iaso --- data/botPolicies.yaml | 68 +++++++++---------- docs/docs/admin/configuration/expressions.mdx | 24 ++++++- lib/policy/config/testdata/good/entropy.yaml | 8 +++ lib/policy/expressions/environment.go | 18 +++++ 4 files changed, 83 insertions(+), 35 deletions(-) create mode 100644 lib/policy/config/testdata/good/entropy.yaml diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index 13df330..1736147 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -11,44 +11,44 @@ ## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from. 
bots: -# Pathological bots to deny -- # This correlates to data/bots/deny-pathological.yaml in the source tree - # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml - import: (data)/bots/_deny-pathological.yaml -- import: (data)/bots/aggressive-brazilian-scrapers.yaml + # Pathological bots to deny + - # This correlates to data/bots/deny-pathological.yaml in the source tree + # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml + import: (data)/bots/_deny-pathological.yaml + - import: (data)/bots/aggressive-brazilian-scrapers.yaml -# Enforce https://github.com/ai-robots-txt/ai.robots.txt -- import: (data)/bots/ai-robots-txt.yaml + # Enforce https://github.com/ai-robots-txt/ai.robots.txt + - import: (data)/bots/ai-robots-txt.yaml -# Search engine crawlers to allow, defaults to: -# - Google (so they don't try to bypass Anubis) -# - Bing -# - DuckDuckGo -# - Qwant -# - The Internet Archive -# - Kagi -# - Marginalia -# - Mojeek -- import: (data)/crawlers/_allow-good.yaml + # Search engine crawlers to allow, defaults to: + # - Google (so they don't try to bypass Anubis) + # - Bing + # - DuckDuckGo + # - Qwant + # - The Internet Archive + # - Kagi + # - Marginalia + # - Mojeek + - import: (data)/crawlers/_allow-good.yaml -# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) -- import: (data)/common/keep-internet-working.yaml + # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) + - import: (data)/common/keep-internet-working.yaml -# # Punish any bot with "bot" in the user-agent string -# # This is known to have a high false-positive rate, use at your own risk -# - name: generic-bot-catchall -# user_agent_regex: (?i:bot|crawler) -# action: CHALLENGE -# challenge: -# difficulty: 16 # impossible -# report_as: 4 # lie to the operator -# algorithm: slow # intentionally waste CPU cycles and time + # # Punish any bot with "bot" in the user-agent string + 
# # This is known to have a high false-positive rate, use at your own risk + # - name: generic-bot-catchall + # user_agent_regex: (?i:bot|crawler) + # action: CHALLENGE + # challenge: + # difficulty: 16 # impossible + # report_as: 4 # lie to the operator + # algorithm: slow # intentionally waste CPU cycles and time -# Generic catchall rule -- name: generic-browser - user_agent_regex: >- - Mozilla|Opera - action: CHALLENGE + # Generic catchall rule + - name: generic-browser + user_agent_regex: >- + Mozilla|Opera + action: CHALLENGE dnsbl: false @@ -58,4 +58,4 @@ dnsbl: false # will stop sending requests once they get it. status_codes: CHALLENGE: 200 - DENY: 200 \ No newline at end of file + DENY: 200 diff --git a/docs/docs/admin/configuration/expressions.mdx b/docs/docs/admin/configuration/expressions.mdx index 4e23ab8..0786c22 100644 --- a/docs/docs/admin/configuration/expressions.mdx +++ b/docs/docs/admin/configuration/expressions.mdx @@ -143,7 +143,29 @@ Anubis would return a challenge because all of those conditions are true. ## Functions exposed to Anubis expressions -There are currently no functions from the Anubis runtime exposed to expressions. This will change in the future. +Anubis expressions can be augmented with the following functions: + +### `randInt` + +```ts +function randInt(n: int): int; +``` + +randInt returns a randomly selected integer value in the range of `[0,n)`; `n` must be greater than zero. This is a thin wrapper around [Go's math/rand/v2#IntN](https://pkg.go.dev/math/rand/v2#IntN), which panics if `n <= 0`. Be careful with this as it may cause inconsistent behavior for genuine users. + +This is best applied when doing explicit block rules, e.g.: + +```yaml +# Denies LightPanda about 75% of the time on average +- name: deny-lightpanda-sometimes + action: DENY + expression: + all: + - userAgent.matches("LightPanda") + - randInt(16) >= 4 +``` + +It seems counter-intuitive to allow known bad clients through sometimes, but this allows you to confuse attackers by making Anubis' behavior random. 
Adjust the thresholds and numbers as facts and circumstances demand. ## Life advice diff --git a/lib/policy/config/testdata/good/entropy.yaml b/lib/policy/config/testdata/good/entropy.yaml new file mode 100644 index 0000000..80110c1 --- /dev/null +++ b/lib/policy/config/testdata/good/entropy.yaml @@ -0,0 +1,8 @@ +bots: + - name: total-randomness + action: ALLOW + expression: + all: + - '"Accept" in headers' + - headers["Accept"].contains("text/html") + - randInt(1) == 0 diff --git a/lib/policy/expressions/environment.go b/lib/policy/expressions/environment.go index f0ea4fd..474fd9e 100644 --- a/lib/policy/expressions/environment.go +++ b/lib/policy/expressions/environment.go @@ -1,7 +1,11 @@ package expressions import ( + "math/rand/v2" + "github.com/google/cel-go/cel" + "github.com/google/cel-go/common/types" + "github.com/google/cel-go/common/types/ref" "github.com/google/cel-go/ext" ) @@ -29,6 +33,20 @@ func NewEnvironment() (*cel.Env, error) { cel.Variable("headers", cel.MapType(cel.StringType, cel.StringType)), // Functions exposed to CEL programs: + cel.Function("randInt", + cel.Overload("randInt_int", + []*cel.Type{cel.IntType}, + cel.IntType, + cel.UnaryBinding(func(val ref.Val) ref.Val { + n, ok := val.(types.Int) + if !ok { + return types.ValOrErr(val, "value is not an integer, but is %T", val) + } + + return types.Int(rand.IntN(int(n))) + }), + ), + ), ) }