mirror of
https://github.com/TecharoHQ/anubis.git
synced 2025-08-03 09:48:08 -04:00
perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676)
* perf(internal): Use FastHash for internal hashing docs: Add xxhash performance improvement to changelog entry feat(hash): Add fast non-cryptographic hash function Signed-off-by: Jason Cameron <git@jasoncameron.dev> * test(hash): add xxhash benchmarks and collision tests Signed-off-by: Jason Cameron <git@jasoncameron.dev> * Update metadata check-spelling run (pull_request) for json/hash Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> --------- Signed-off-by: Jason Cameron <git@jasoncameron.dev> Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
This commit is contained in:
parent
3437e575d4
commit
e2b46fc5e7
4
.github/actions/spelling/expect.txt
vendored
4
.github/actions/spelling/expect.txt
vendored
@ -36,6 +36,7 @@ celchecker
|
||||
CELPHASE
|
||||
cerr
|
||||
certresolver
|
||||
cespare
|
||||
CGNAT
|
||||
cgr
|
||||
chainguard
|
||||
@ -186,7 +187,6 @@ OCOB
|
||||
ogtags
|
||||
omgili
|
||||
omgilibot
|
||||
onionservice
|
||||
openai
|
||||
openrc
|
||||
pag
|
||||
@ -214,6 +214,7 @@ qualys
|
||||
qwant
|
||||
qwantbot
|
||||
rac
|
||||
rawler
|
||||
rcvar
|
||||
redir
|
||||
redirectscheme
|
||||
@ -264,7 +265,6 @@ thoth
|
||||
thothmock
|
||||
Tik
|
||||
Timpibot
|
||||
torproject
|
||||
traefik
|
||||
uberspace
|
||||
unixhttpd
|
||||
|
@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
|
||||
- Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409))
|
||||
- Implement GeoIP and ASN based checks via [Thoth](https://anubis.techaro.lol/docs/admin/thoth) ([#206](https://github.com/TecharoHQ/anubis/issues/206))
|
||||
- Replace internal SHA256 hashing with xxhash for 4-6x performance improvement in policy evaluation and cache operations
|
||||
|
||||
## v1.19.1: Jenomis cen Lexentale - Echo 1
|
||||
|
||||
|
6
go.mod
6
go.mod
@ -5,19 +5,21 @@ go 1.24.2
|
||||
require (
|
||||
github.com/TecharoHQ/thoth-proto v0.4.0
|
||||
github.com/a-h/templ v0.3.898
|
||||
github.com/cespare/xxhash/v2 v2.3.0
|
||||
github.com/facebookgo/flagenv v0.0.0-20160425205200-fcd59fca7456
|
||||
github.com/gaissmai/bart v0.20.4
|
||||
github.com/golang-jwt/jwt/v5 v5.2.2
|
||||
github.com/google/cel-go v0.25.0
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/playwright-community/playwright-go v0.5200.0
|
||||
github.com/prometheus/client_golang v1.22.0
|
||||
github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a
|
||||
github.com/yl2chen/cidranger v1.0.2
|
||||
golang.org/x/net v0.41.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
google.golang.org/grpc v1.72.2
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
k8s.io/apimachinery v0.33.1
|
||||
sigs.k8s.io/yaml v1.4.0
|
||||
)
|
||||
@ -43,7 +45,6 @@ require (
|
||||
github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb // indirect
|
||||
github.com/cavaliergopher/cpio v1.0.1 // indirect
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/cli/browser v1.3.0 // indirect
|
||||
github.com/cli/go-gh v0.1.0 // indirect
|
||||
github.com/cloudflare/circl v1.6.1 // indirect
|
||||
@ -72,7 +73,6 @@ require (
|
||||
github.com/goreleaser/chglog v0.7.0 // indirect
|
||||
github.com/goreleaser/fileglob v1.3.0 // indirect
|
||||
github.com/goreleaser/nfpm/v2 v2.42.1 // indirect
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect
|
||||
github.com/huandu/xstrings v1.5.0 // indirect
|
||||
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
|
||||
github.com/kevinburke/ssh_config v1.2.0 // indirect
|
||||
|
@ -3,10 +3,23 @@ package internal
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"strconv"
|
||||
|
||||
"github.com/cespare/xxhash/v2"
|
||||
)
|
||||
|
||||
// SHA256sum returns the lowercase hexadecimal SHA-256 digest of text.
//
// This is the cryptographic hash of the package; it remains in use for
// proof-of-work challenges, where the security properties of a cryptographic
// hash function are required.
func SHA256sum(text string) string {
	digest := sha256.Sum256([]byte(text))
	return hex.EncodeToString(digest[:])
}
|
||||
|
||||
// FastHash is a high-performance non-cryptographic hash function suitable for
|
||||
// internal caching, policy rule identification, and other performance-critical
|
||||
// use cases where cryptographic security is not required.
|
||||
func FastHash(text string) string {
|
||||
h := xxhash.Sum64String(text)
|
||||
return strconv.FormatUint(h, 16)
|
||||
}
|
||||
|
261
internal/hash_bench_test.go
Normal file
261
internal/hash_bench_test.go
Normal file
@ -0,0 +1,261 @@
|
||||
package internal
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// XXHash64sum is a test alias for FastHash to benchmark against SHA256
|
||||
func XXHash64sum(text string) string {
|
||||
return FastHash(text)
|
||||
}
|
||||
|
||||
// Fixtures below are modelled on the strings the codebase hashes in practice.

// policyInputs holds typical policy checker inputs: header matches, path
// expressions, and CIDR ranges.
var policyInputs = []string{
	"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
	"User-Agent: bot/1.0",
	"User-Agent: GoogleBot/2.1",
	"/robots.txt",
	"/api/.*",
	"10.0.0.0/8",
	"192.168.1.0/24",
	"172.16.0.0/12",
}

// challengeInputs holds challenge data shaped like the input of the
// challengeFor function.
var challengeInputs = []string{
	"Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
	"Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
	"Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
}

// botRuleInputs holds bot rule patterns in "name::rule" form.
var botRuleInputs = []string{
	"GoogleBot::path:/robots.txt",
	"BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
	"FacebookBot::headers:Accept-Language,User-Agent",
	"TwitterBot::cidr:192.168.1.0/24",
}

// celInputs holds CEL expression sources as they appear in policy rules.
var celInputs = []string{
	`request.headers["User-Agent"].contains("bot")`,
	`request.path.startsWith("/api/") && request.method == "POST"`,
	`request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
	`request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
}

// asnInputs holds newline-joined inputs in the form used by the Thoth ASN
// checker.
var asnInputs = []string{
	"ASNChecker\nAS 15169\nAS 8075\nAS 32934",
	"ASNChecker\nAS 13335\nAS 16509\nAS 14061",
	"ASNChecker\nAS 36351\nAS 20940\nAS 8100",
}
|
||||
|
||||
func BenchmarkSHA256_PolicyInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := policyInputs[i%len(policyInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_PolicyInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := policyInputs[i%len(policyInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := challengeInputs[i%len(challengeInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := challengeInputs[i%len(challengeInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := botRuleInputs[i%len(botRuleInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := botRuleInputs[i%len(botRuleInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_CELInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := celInputs[i%len(celInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_CELInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := celInputs[i%len(celInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_ASNInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := asnInputs[i%len(asnInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_ASNInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := asnInputs[i%len(asnInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmark the policy list hashing used in checker.go
|
||||
func BenchmarkSHA256_PolicyList(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
var sb strings.Builder
|
||||
for _, input := range policyInputs {
|
||||
fmt.Fprintln(&sb, SHA256sum(input))
|
||||
}
|
||||
_ = SHA256sum(sb.String())
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_PolicyList(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
var sb strings.Builder
|
||||
for _, input := range policyInputs {
|
||||
fmt.Fprintln(&sb, XXHash64sum(input))
|
||||
}
|
||||
_ = XXHash64sum(sb.String())
|
||||
}
|
||||
}
|
||||
|
||||
// Tests that xxhash doesn't have collisions in realistic scenarios
|
||||
func TestHashCollisions(t *testing.T) {
|
||||
allInputs := append(append(append(append(policyInputs, challengeInputs...), botRuleInputs...), celInputs...), asnInputs...)
|
||||
|
||||
// Start with realistic inputs from actual usage
|
||||
xxhashHashes := make(map[string]string)
|
||||
for _, input := range allInputs {
|
||||
hash := XXHash64sum(input)
|
||||
if existing, exists := xxhashHashes[hash]; exists {
|
||||
t.Errorf("XXHash collision detected: %q and %q both hash to %s", input, existing, hash)
|
||||
}
|
||||
xxhashHashes[hash] = input
|
||||
}
|
||||
|
||||
t.Logf("Basic test: %d realistic inputs, no collisions", len(allInputs))
|
||||
|
||||
// Test similar strings that might cause hash collisions
|
||||
prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
|
||||
suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
|
||||
variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}
|
||||
|
||||
stressCount := 0
|
||||
for _, prefix := range prefixes {
|
||||
for _, suffix := range suffixes {
|
||||
for _, variation := range variations {
|
||||
for i := 0; i < 100; i++ {
|
||||
input := fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i)
|
||||
hash := XXHash64sum(input)
|
||||
if existing, exists := xxhashHashes[hash]; exists {
|
||||
t.Errorf("XXHash collision in stress test: %q and %q both hash to %s", input, existing, hash)
|
||||
}
|
||||
xxhashHashes[hash] = input
|
||||
stressCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Logf("Stress test 1: %d similar string variations, no collisions", stressCount)
|
||||
|
||||
// Test sequential patterns that might be problematic
|
||||
patterns := []string{
|
||||
"192.168.1.%d",
|
||||
"10.0.0.%d",
|
||||
"172.16.%d.1",
|
||||
"challenge-%d",
|
||||
"bot-rule-%d",
|
||||
"policy-%016x",
|
||||
"session-%016x",
|
||||
}
|
||||
|
||||
seqCount := 0
|
||||
for _, pattern := range patterns {
|
||||
for i := 0; i < 10000; i++ {
|
||||
input := fmt.Sprintf(pattern, i)
|
||||
hash := XXHash64sum(input)
|
||||
if existing, exists := xxhashHashes[hash]; exists {
|
||||
t.Errorf("XXHash collision in sequential test: %q and %q both hash to %s", input, existing, hash)
|
||||
}
|
||||
xxhashHashes[hash] = input
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
t.Logf("Stress test 2: %d sequential patterns, no collisions", seqCount)
|
||||
|
||||
totalInputs := len(allInputs) + stressCount + seqCount
|
||||
t.Logf("TOTAL: Tested %d inputs across realistic scenarios - NO COLLISIONS", totalInputs)
|
||||
}
|
||||
|
||||
// Verify xxhash output works as cache keys
|
||||
func TestXXHashFormat(t *testing.T) {
|
||||
testCases := []string{
|
||||
"short",
|
||||
"",
|
||||
"very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
|
||||
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
||||
}
|
||||
|
||||
for _, input := range testCases {
|
||||
hash := XXHash64sum(input)
|
||||
|
||||
// Check it's valid hex
|
||||
if len(hash) == 0 {
|
||||
t.Errorf("Empty hash for input %q", input)
|
||||
}
|
||||
|
||||
// xxhash is 64-bit so max 16 hex chars
|
||||
if len(hash) > 16 {
|
||||
t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, len(hash))
|
||||
}
|
||||
|
||||
// Make sure it's all hex characters
|
||||
for _, char := range hash {
|
||||
if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f')) {
|
||||
t.Errorf("Non-hex character %c in hash %s for input %q", char, hash, input)
|
||||
}
|
||||
}
|
||||
|
||||
t.Logf("Input: %q -> Hash: %s", input, hash)
|
||||
}
|
||||
}
|
@ -26,7 +26,7 @@ func (c *Client) ASNCheckerFor(asns []uint32) checker.Impl {
|
||||
return &ASNChecker{
|
||||
iptoasn: c.IPToASN,
|
||||
asns: asnMap,
|
||||
hash: internal.SHA256sum(sb.String()),
|
||||
hash: internal.FastHash(sb.String()),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,7 +90,7 @@ func (s *Server) challengeFor(r *http.Request, difficulty int) string {
|
||||
fp,
|
||||
difficulty,
|
||||
)
|
||||
return internal.SHA256sum(challengeData)
|
||||
return internal.FastHash(challengeData)
|
||||
}
|
||||
|
||||
func (s *Server) maybeReverseProxyHttpStatusOnly(w http.ResponseWriter, r *http.Request) {
|
||||
|
@ -17,5 +17,5 @@ type Bot struct {
|
||||
}
|
||||
|
||||
func (b Bot) Hash() string {
|
||||
return internal.SHA256sum(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash()))
|
||||
return internal.FastHash(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash()))
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ func NewCELChecker(cfg *config.ExpressionOrList) (*CELChecker, error) {
|
||||
}
|
||||
|
||||
func (cc *CELChecker) Hash() string {
|
||||
return internal.SHA256sum(cc.src)
|
||||
return internal.FastHash(cc.src)
|
||||
}
|
||||
|
||||
func (cc *CELChecker) Check(r *http.Request) (bool, error) {
|
||||
|
@ -28,7 +28,7 @@ func (staticHashChecker) Check(r *http.Request) (bool, error) {
|
||||
func (s staticHashChecker) Hash() string { return s.hash }
|
||||
|
||||
func NewStaticHashChecker(hashable string) checker.Impl {
|
||||
return staticHashChecker{hash: internal.SHA256sum(hashable)}
|
||||
return staticHashChecker{hash: internal.FastHash(hashable)}
|
||||
}
|
||||
|
||||
type RemoteAddrChecker struct {
|
||||
@ -55,7 +55,7 @@ func NewRemoteAddrChecker(cidrs []string) (checker.Impl, error) {
|
||||
|
||||
return &RemoteAddrChecker{
|
||||
ranger: ranger,
|
||||
hash: internal.SHA256sum(sb.String()),
|
||||
hash: internal.FastHash(sb.String()),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@ -101,7 +101,7 @@ func NewHeaderMatchesChecker(header, rexStr string) (checker.Impl, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
|
||||
}
|
||||
return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.SHA256sum(header + ": " + rexStr)}, nil
|
||||
return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.FastHash(header + ": " + rexStr)}, nil
|
||||
}
|
||||
|
||||
func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) {
|
||||
@ -126,7 +126,7 @@ func NewPathChecker(rexStr string) (checker.Impl, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
|
||||
}
|
||||
return &PathChecker{rex, internal.SHA256sum(rexStr)}, nil
|
||||
return &PathChecker{rex, internal.FastHash(rexStr)}, nil
|
||||
}
|
||||
|
||||
func (pc *PathChecker) Check(r *http.Request) (bool, error) {
|
||||
@ -158,7 +158,7 @@ func (hec headerExistsChecker) Check(r *http.Request) (bool, error) {
|
||||
}
|
||||
|
||||
func (hec headerExistsChecker) Hash() string {
|
||||
return internal.SHA256sum(hec.header)
|
||||
return internal.FastHash(hec.header)
|
||||
}
|
||||
|
||||
func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) {
|
||||
@ -177,7 +177,7 @@ func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
result = append(result, &HeaderMatchesChecker{key, rex, internal.SHA256sum(key + ": " + rexStr)})
|
||||
result = append(result, &HeaderMatchesChecker{key, rex, internal.FastHash(key + ": " + rexStr)})
|
||||
}
|
||||
|
||||
if len(errs) != 0 {
|
||||
|
@ -37,5 +37,5 @@ func (l List) Hash() string {
|
||||
fmt.Fprintln(&sb, c.Hash())
|
||||
}
|
||||
|
||||
return internal.SHA256sum(sb.String())
|
||||
return internal.FastHash(sb.String())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user