From e2b46fc5e77b32125ed3355008d8f3e0b62a7dd3 Mon Sep 17 00:00:00 2001 From: Jason Cameron Date: Mon, 16 Jun 2025 22:53:53 -0400 Subject: [PATCH] perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676) * perf(internal): Use FastHash for internal hashing docs: Add xxhash performance improvement to changelog entry feat(hash): Add fast non-cryptographic hash function Signed-off-by: Jason Cameron * test(hash): add xxhash benchmarks and collision tests Signed-off-by: Jason Cameron * Update metadata check-spelling run (pull_request) for json/hash Signed-off-by: check-spelling-bot on-behalf-of: @check-spelling --------- Signed-off-by: Jason Cameron Signed-off-by: check-spelling-bot --- .github/actions/spelling/expect.txt | 4 +- docs/docs/CHANGELOG.md | 1 + go.mod | 6 +- internal/hash.go | 13 ++ internal/hash_bench_test.go | 261 ++++++++++++++++++++++++++++ internal/thoth/asnchecker.go | 2 +- lib/anubis.go | 2 +- lib/policy/bot.go | 2 +- lib/policy/celchecker.go | 2 +- lib/policy/checker.go | 12 +- lib/policy/checker/checker.go | 2 +- 11 files changed, 291 insertions(+), 16 deletions(-) create mode 100644 internal/hash_bench_test.go diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 00e5fd8..ab110f0 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -36,6 +36,7 @@ celchecker CELPHASE cerr certresolver +cespare CGNAT cgr chainguard @@ -186,7 +187,6 @@ OCOB ogtags omgili omgilibot -onionservice openai openrc pag @@ -214,6 +214,7 @@ qualys qwant qwantbot rac +rawler rcvar redir redirectscheme @@ -264,7 +265,6 @@ thoth thothmock Tik Timpibot -torproject traefik uberspace unixhttpd diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 7662ee4..79f55d0 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers - Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409)) - Implement GeoIP and ASN based checks via [Thoth](https://anubis.techaro.lol/docs/admin/thoth) ([#206](https://github.com/TecharoHQ/anubis/issues/206)) +- Replace internal SHA256 hashing with xxhash for 4-6x performance improvement in policy evaluation and cache operations ## v1.19.1: Jenomis cen Lexentale - Echo 1 diff --git a/go.mod b/go.mod index 9c60fda..b6271e4 100644 --- a/go.mod +++ b/go.mod @@ -5,19 +5,21 @@ go 1.24.2 require ( github.com/TecharoHQ/thoth-proto v0.4.0 github.com/a-h/templ v0.3.898 + github.com/cespare/xxhash/v2 v2.3.0 github.com/facebookgo/flagenv v0.0.0-20160425205200-fcd59fca7456 github.com/gaissmai/bart v0.20.4 github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/cel-go v0.25.0 github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 + github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 github.com/joho/godotenv v1.5.1 github.com/playwright-community/playwright-go v0.5200.0 github.com/prometheus/client_golang v1.22.0 github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a github.com/yl2chen/cidranger v1.0.2 golang.org/x/net v0.41.0 - gopkg.in/yaml.v3 v3.0.1 google.golang.org/grpc v1.72.2 + gopkg.in/yaml.v3 v3.0.1 k8s.io/apimachinery v0.33.1 sigs.k8s.io/yaml v1.4.0 ) @@ -43,7 +45,6 @@ require ( github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb // indirect github.com/cavaliergopher/cpio v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect - github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cli/browser v1.3.0 // indirect github.com/cli/go-gh v0.1.0 // indirect github.com/cloudflare/circl v1.6.1 // indirect @@ -72,7 +73,6 @@ require ( github.com/goreleaser/chglog v0.7.0 // indirect github.com/goreleaser/fileglob v1.3.0 // indirect github.com/goreleaser/nfpm/v2 v2.42.1 // indirect - github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect github.com/huandu/xstrings v1.5.0 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect diff --git a/internal/hash.go b/internal/hash.go index 818ad55..aeec173 100644 --- a/internal/hash.go +++ b/internal/hash.go @@ -3,10 +3,23 @@ package internal import ( "crypto/sha256" "encoding/hex" + "strconv" + + "github.com/cespare/xxhash/v2" ) +// SHA256sum computes a cryptographic hash. Still used for proof-of-work challenges +// where we need the security properties of a cryptographic hash function. func SHA256sum(text string) string { hash := sha256.New() hash.Write([]byte(text)) return hex.EncodeToString(hash.Sum(nil)) } + +// FastHash is a high-performance non-cryptographic hash function suitable for +// internal caching, policy rule identification, and other performance-critical +// use cases where cryptographic security is not required. +func FastHash(text string) string { + h := xxhash.Sum64String(text) + return strconv.FormatUint(h, 16) +} diff --git a/internal/hash_bench_test.go b/internal/hash_bench_test.go new file mode 100644 index 0000000..5384570 --- /dev/null +++ b/internal/hash_bench_test.go @@ -0,0 +1,261 @@ +package internal + +import ( + "fmt" + "strings" + "testing" +) + +// XXHash64sum is a test alias for FastHash to benchmark against SHA256 +func XXHash64sum(text string) string { + return FastHash(text) +} + +// Test data that matches real usage patterns in the codebase +var ( + // Typical policy checker inputs + policyInputs = []string{ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "User-Agent: bot/1.0", + "User-Agent: GoogleBot/2.1", + "/robots.txt", + "/api/.*", + "10.0.0.0/8", + "192.168.1.0/24", + "172.16.0.0/12", + } + + // Challenge data from challengeFor function + challengeInputs = []string{ + "Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5", + "Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3", + "Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7", + } + + // Bot rule patterns + botRuleInputs = []string{ + "GoogleBot::path:/robots.txt", + "BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)", + "FacebookBot::headers:Accept-Language,User-Agent", + "TwitterBot::cidr:192.168.1.0/24", + } + + // CEL expressions from policy rules + celInputs = []string{ + `request.headers["User-Agent"].contains("bot")`, + `request.path.startsWith("/api/") && request.method == "POST"`, + `request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`, + `request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`, + } + + // Thoth ASN checker inputs + asnInputs = []string{ + "ASNChecker\nAS 15169\nAS 8075\nAS 32934", + "ASNChecker\nAS 13335\nAS 16509\nAS 14061", + "ASNChecker\nAS 36351\nAS 20940\nAS 8100", + } +) + +func BenchmarkSHA256_PolicyInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := policyInputs[i%len(policyInputs)] + _ = SHA256sum(input) + } +} + +func BenchmarkXXHash_PolicyInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := policyInputs[i%len(policyInputs)] + _ = XXHash64sum(input) + } +} + +func BenchmarkSHA256_ChallengeInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := challengeInputs[i%len(challengeInputs)] + _ = SHA256sum(input) + } +} + +func BenchmarkXXHash_ChallengeInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := challengeInputs[i%len(challengeInputs)] + _ = XXHash64sum(input) + } +} + +func BenchmarkSHA256_BotRuleInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := botRuleInputs[i%len(botRuleInputs)] + _ = SHA256sum(input) + } +} + +func BenchmarkXXHash_BotRuleInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := botRuleInputs[i%len(botRuleInputs)] + _ = XXHash64sum(input) + } +} + +func BenchmarkSHA256_CELInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := celInputs[i%len(celInputs)] + _ = SHA256sum(input) + } +} + +func BenchmarkXXHash_CELInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := celInputs[i%len(celInputs)] + _ = XXHash64sum(input) + } +} + +func BenchmarkSHA256_ASNInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := asnInputs[i%len(asnInputs)] + _ = SHA256sum(input) + } +} + +func BenchmarkXXHash_ASNInputs(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + input := asnInputs[i%len(asnInputs)] + _ = XXHash64sum(input) + } +} + +// Benchmark the policy list hashing used in checker.go +func BenchmarkSHA256_PolicyList(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + var sb strings.Builder + for _, input := range policyInputs { + fmt.Fprintln(&sb, SHA256sum(input)) + } + _ = SHA256sum(sb.String()) + } +} + +func BenchmarkXXHash_PolicyList(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + var sb strings.Builder + for _, input := range policyInputs { + fmt.Fprintln(&sb, XXHash64sum(input)) + } + _ = XXHash64sum(sb.String()) + } +} + +// Tests that xxhash doesn't have collisions in realistic scenarios +func TestHashCollisions(t *testing.T) { + allInputs := append(append(append(append(policyInputs, challengeInputs...), botRuleInputs...), celInputs...), asnInputs...) + + // Start with realistic inputs from actual usage + xxhashHashes := make(map[string]string) + for _, input := range allInputs { + hash := XXHash64sum(input) + if existing, exists := xxhashHashes[hash]; exists { + t.Errorf("XXHash collision detected: %q and %q both hash to %s", input, existing, hash) + } + xxhashHashes[hash] = input + } + + t.Logf("Basic test: %d realistic inputs, no collisions", len(allInputs)) + + // Test similar strings that might cause hash collisions + prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "} + suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"} + variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"} + + stressCount := 0 + for _, prefix := range prefixes { + for _, suffix := range suffixes { + for _, variation := range variations { + for i := 0; i < 100; i++ { + input := fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i) + hash := XXHash64sum(input) + if existing, exists := xxhashHashes[hash]; exists { + t.Errorf("XXHash collision in stress test: %q and %q both hash to %s", input, existing, hash) + } + xxhashHashes[hash] = input + stressCount++ + } + } + } + } + t.Logf("Stress test 1: %d similar string variations, no collisions", stressCount) + + // Test sequential patterns that might be problematic + patterns := []string{ + "192.168.1.%d", + "10.0.0.%d", + "172.16.%d.1", + "challenge-%d", + "bot-rule-%d", + "policy-%016x", + "session-%016x", + } + + seqCount := 0 + for _, pattern := range patterns { + for i := 0; i < 10000; i++ { + input := fmt.Sprintf(pattern, i) + hash := XXHash64sum(input) + if existing, exists := xxhashHashes[hash]; exists { + t.Errorf("XXHash collision in sequential test: %q and %q both hash to %s", input, existing, hash) + } + xxhashHashes[hash] = input + seqCount++ + } + } + t.Logf("Stress test 2: %d sequential patterns, no collisions", seqCount) + + totalInputs := len(allInputs) + stressCount + seqCount + t.Logf("TOTAL: Tested %d inputs across realistic scenarios - NO COLLISIONS", totalInputs) +} + +// Verify xxhash output works as cache keys +func TestXXHashFormat(t *testing.T) { + testCases := []string{ + "short", + "", + "very long string with lots of content that might be used in policy checking and other internal hashing scenarios", + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + } + + for _, input := range testCases { + hash := XXHash64sum(input) + + // Check it's valid hex + if len(hash) == 0 { + t.Errorf("Empty hash for input %q", input) + } + + // xxhash is 64-bit so max 16 hex chars + if len(hash) > 16 { + t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, len(hash)) + } + + // Make sure it's all hex characters + for _, char := range hash { + if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f')) { + t.Errorf("Non-hex character %c in hash %s for input %q", char, hash, input) + } + } + + t.Logf("Input: %q -> Hash: %s", input, hash) + } +} diff --git a/internal/thoth/asnchecker.go b/internal/thoth/asnchecker.go index b2697c8..548765c 100644 --- a/internal/thoth/asnchecker.go +++ b/internal/thoth/asnchecker.go @@ -26,7 +26,7 @@ func (c *Client) ASNCheckerFor(asns []uint32) checker.Impl { return &ASNChecker{ iptoasn: c.IPToASN, asns: asnMap, - hash: internal.SHA256sum(sb.String()), + hash: internal.FastHash(sb.String()), } } diff --git a/lib/anubis.go b/lib/anubis.go index f700281..c8945fb 100644 --- a/lib/anubis.go +++ b/lib/anubis.go @@ -90,7 +90,7 @@ func (s *Server) challengeFor(r *http.Request, difficulty int) string { fp, difficulty, ) - return internal.SHA256sum(challengeData) + return internal.FastHash(challengeData) } func (s *Server) maybeReverseProxyHttpStatusOnly(w http.ResponseWriter, r *http.Request) { diff --git a/lib/policy/bot.go b/lib/policy/bot.go index ba884d6..479bccc 100644 --- a/lib/policy/bot.go +++ b/lib/policy/bot.go @@ -17,5 +17,5 @@ type Bot struct { } func (b Bot) Hash() string { - return internal.SHA256sum(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash())) + return internal.FastHash(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash())) } diff --git a/lib/policy/celchecker.go b/lib/policy/celchecker.go index 1269fca..b19ea60 100644 --- a/lib/policy/celchecker.go +++ b/lib/policy/celchecker.go @@ -63,7 +63,7 @@ func NewCELChecker(cfg *config.ExpressionOrList) (*CELChecker, error) { } func (cc *CELChecker) Hash() string { - return internal.SHA256sum(cc.src) + return internal.FastHash(cc.src) } func (cc *CELChecker) Check(r *http.Request) (bool, error) { diff --git a/lib/policy/checker.go b/lib/policy/checker.go index 0096578..33b58d4 100644 --- a/lib/policy/checker.go +++ b/lib/policy/checker.go @@ -28,7 +28,7 @@ func (staticHashChecker) Check(r *http.Request) (bool, error) { func (s staticHashChecker) Hash() string { return s.hash } func NewStaticHashChecker(hashable string) checker.Impl { - return staticHashChecker{hash: internal.SHA256sum(hashable)} + return staticHashChecker{hash: internal.FastHash(hashable)} } type RemoteAddrChecker struct { @@ -55,7 +55,7 @@ func NewRemoteAddrChecker(cidrs []string) (checker.Impl, error) { return &RemoteAddrChecker{ ranger: ranger, - hash: internal.SHA256sum(sb.String()), + hash: internal.FastHash(sb.String()), }, nil } @@ -101,7 +101,7 @@ func NewHeaderMatchesChecker(header, rexStr string) (checker.Impl, error) { if err != nil { return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err) } - return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.SHA256sum(header + ": " + rexStr)}, nil + return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.FastHash(header + ": " + rexStr)}, nil } func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) { @@ -126,7 +126,7 @@ func NewPathChecker(rexStr string) (checker.Impl, error) { if err != nil { return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err) } - return &PathChecker{rex, internal.SHA256sum(rexStr)}, nil + return &PathChecker{rex, internal.FastHash(rexStr)}, nil } func (pc *PathChecker) Check(r *http.Request) (bool, error) { @@ -158,7 +158,7 @@ func (hec headerExistsChecker) Check(r *http.Request) (bool, error) { } func (hec headerExistsChecker) Hash() string { - return internal.SHA256sum(hec.header) + return internal.FastHash(hec.header) } func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) { @@ -177,7 +177,7 @@ func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) { continue } - result = append(result, &HeaderMatchesChecker{key, rex, internal.SHA256sum(key + ": " + rexStr)}) + result = append(result, &HeaderMatchesChecker{key, rex, internal.FastHash(key + ": " + rexStr)}) } if len(errs) != 0 { diff --git a/lib/policy/checker/checker.go b/lib/policy/checker/checker.go index 4d7b5c7..1ee276a 100644 --- a/lib/policy/checker/checker.go +++ b/lib/policy/checker/checker.go @@ -37,5 +37,5 @@ func (l List) Hash() string { fmt.Fprintln(&sb, c.Hash()) } - return internal.SHA256sum(sb.String()) + return internal.FastHash(sb.String()) }