perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676)

* perf(internal): Use FastHash for internal hashing
docs: Add xxhash performance improvement to changelog entry
feat(hash): Add fast non-cryptographic hash function

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* test(hash): add xxhash benchmarks and collision tests

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Update metadata

check-spelling run (pull_request) for json/hash

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

---------

Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
This commit is contained in:
Jason Cameron 2025-06-16 22:53:53 -04:00 committed by GitHub
parent 3437e575d4
commit e2b46fc5e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 291 additions and 16 deletions

View File

@ -36,6 +36,7 @@ celchecker
CELPHASE CELPHASE
cerr cerr
certresolver certresolver
cespare
CGNAT CGNAT
cgr cgr
chainguard chainguard
@ -186,7 +187,6 @@ OCOB
ogtags ogtags
omgili omgili
omgilibot omgilibot
onionservice
openai openai
openrc openrc
pag pag
@ -214,6 +214,7 @@ qualys
qwant qwant
qwantbot qwantbot
rac rac
rawler
rcvar rcvar
redir redir
redirectscheme redirectscheme
@ -264,7 +265,6 @@ thoth
thothmock thothmock
Tik Tik
Timpibot Timpibot
torproject
traefik traefik
uberspace uberspace
unixhttpd unixhttpd

View File

@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
- Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409)) - Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409))
- Implement GeoIP and ASN based checks via [Thoth](https://anubis.techaro.lol/docs/admin/thoth) ([#206](https://github.com/TecharoHQ/anubis/issues/206)) - Implement GeoIP and ASN based checks via [Thoth](https://anubis.techaro.lol/docs/admin/thoth) ([#206](https://github.com/TecharoHQ/anubis/issues/206))
- Replace internal SHA256 hashing with xxhash for 4-6x performance improvement in policy evaluation and cache operations
## v1.19.1: Jenomis cen Lexentale - Echo 1 ## v1.19.1: Jenomis cen Lexentale - Echo 1

6
go.mod
View File

@ -5,19 +5,21 @@ go 1.24.2
require ( require (
github.com/TecharoHQ/thoth-proto v0.4.0 github.com/TecharoHQ/thoth-proto v0.4.0
github.com/a-h/templ v0.3.898 github.com/a-h/templ v0.3.898
github.com/cespare/xxhash/v2 v2.3.0
github.com/facebookgo/flagenv v0.0.0-20160425205200-fcd59fca7456 github.com/facebookgo/flagenv v0.0.0-20160425205200-fcd59fca7456
github.com/gaissmai/bart v0.20.4 github.com/gaissmai/bart v0.20.4
github.com/golang-jwt/jwt/v5 v5.2.2 github.com/golang-jwt/jwt/v5 v5.2.2
github.com/google/cel-go v0.25.0 github.com/google/cel-go v0.25.0
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0
github.com/joho/godotenv v1.5.1 github.com/joho/godotenv v1.5.1
github.com/playwright-community/playwright-go v0.5200.0 github.com/playwright-community/playwright-go v0.5200.0
github.com/prometheus/client_golang v1.22.0 github.com/prometheus/client_golang v1.22.0
github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a
github.com/yl2chen/cidranger v1.0.2 github.com/yl2chen/cidranger v1.0.2
golang.org/x/net v0.41.0 golang.org/x/net v0.41.0
gopkg.in/yaml.v3 v3.0.1
google.golang.org/grpc v1.72.2 google.golang.org/grpc v1.72.2
gopkg.in/yaml.v3 v3.0.1
k8s.io/apimachinery v0.33.1 k8s.io/apimachinery v0.33.1
sigs.k8s.io/yaml v1.4.0 sigs.k8s.io/yaml v1.4.0
) )
@ -43,7 +45,6 @@ require (
github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb // indirect github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb // indirect
github.com/cavaliergopher/cpio v1.0.1 // indirect github.com/cavaliergopher/cpio v1.0.1 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cli/browser v1.3.0 // indirect github.com/cli/browser v1.3.0 // indirect
github.com/cli/go-gh v0.1.0 // indirect github.com/cli/go-gh v0.1.0 // indirect
github.com/cloudflare/circl v1.6.1 // indirect github.com/cloudflare/circl v1.6.1 // indirect
@ -72,7 +73,6 @@ require (
github.com/goreleaser/chglog v0.7.0 // indirect github.com/goreleaser/chglog v0.7.0 // indirect
github.com/goreleaser/fileglob v1.3.0 // indirect github.com/goreleaser/fileglob v1.3.0 // indirect
github.com/goreleaser/nfpm/v2 v2.42.1 // indirect github.com/goreleaser/nfpm/v2 v2.42.1 // indirect
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect
github.com/huandu/xstrings v1.5.0 // indirect github.com/huandu/xstrings v1.5.0 // indirect
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
github.com/kevinburke/ssh_config v1.2.0 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect

View File

@ -3,10 +3,23 @@ package internal
import ( import (
"crypto/sha256" "crypto/sha256"
"encoding/hex" "encoding/hex"
"strconv"
"github.com/cespare/xxhash/v2"
) )
// SHA256sum computes a cryptographic hash. Still used for proof-of-work challenges
// where we need the security properties of a cryptographic hash function.
func SHA256sum(text string) string { func SHA256sum(text string) string {
hash := sha256.New() hash := sha256.New()
hash.Write([]byte(text)) hash.Write([]byte(text))
return hex.EncodeToString(hash.Sum(nil)) return hex.EncodeToString(hash.Sum(nil))
} }
// FastHash returns the 64-bit xxhash digest of text rendered as a lowercase
// hexadecimal string. The digest is not zero-padded, so the result is between
// 1 and 16 characters long.
//
// This is a non-cryptographic hash: use it for internal identifiers such as
// cache keys and policy rule hashes, never where collision resistance against
// an adversary is required.
func FastHash(text string) string {
	return strconv.FormatUint(xxhash.Sum64String(text), 16)
}

261
internal/hash_bench_test.go Normal file
View File

@ -0,0 +1,261 @@
package internal
import (
"fmt"
"strings"
"testing"
)
// XXHash64sum forwards to FastHash. It exists only so that the benchmarks in
// this file can name the xxhash-backed implementation symmetrically with
// SHA256sum.
func XXHash64sum(text string) string {
	digest := FastHash(text)
	return digest
}
// Fixture strings shared by the benchmarks and collision tests below. Each
// group is modelled on a kind of input that gets hashed in the codebase.

// policyInputs holds typical policy checker inputs: header matches, path
// patterns and CIDR ranges.
var policyInputs = []string{
	"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
	"User-Agent: bot/1.0",
	"User-Agent: GoogleBot/2.1",
	"/robots.txt",
	"/api/.*",
	"10.0.0.0/8",
	"192.168.1.0/24",
	"172.16.0.0/12",
}

// challengeInputs holds challenge data strings shaped like the ones built by
// the challengeFor function.
var challengeInputs = []string{
	"Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
	"Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
	"Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
}

// botRuleInputs holds bot rule patterns in "name::rule" form.
var botRuleInputs = []string{
	"GoogleBot::path:/robots.txt",
	"BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
	"FacebookBot::headers:Accept-Language,User-Agent",
	"TwitterBot::cidr:192.168.1.0/24",
}

// celInputs holds CEL expression sources as they would appear in policy rules.
var celInputs = []string{
	`request.headers["User-Agent"].contains("bot")`,
	`request.path.startsWith("/api/") && request.method == "POST"`,
	`request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
	`request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
}

// asnInputs holds newline-separated Thoth ASN checker inputs.
var asnInputs = []string{
	"ASNChecker\nAS 15169\nAS 8075\nAS 32934",
	"ASNChecker\nAS 13335\nAS 16509\nAS 14061",
	"ASNChecker\nAS 36351\nAS 20940\nAS 8100",
}
// BenchmarkSHA256_PolicyInputs measures SHA256sum over the policyInputs
// samples, visiting them round-robin.
func BenchmarkSHA256_PolicyInputs(b *testing.B) {
	samples := policyInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(samples[n%len(samples)])
	}
}
// BenchmarkXXHash_PolicyInputs measures XXHash64sum over the policyInputs
// samples, visiting them round-robin.
func BenchmarkXXHash_PolicyInputs(b *testing.B) {
	samples := policyInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(samples[n%len(samples)])
	}
}
// BenchmarkSHA256_ChallengeInputs measures SHA256sum over the challengeInputs
// samples, visiting them round-robin.
func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
	samples := challengeInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(samples[n%len(samples)])
	}
}
// BenchmarkXXHash_ChallengeInputs measures XXHash64sum over the
// challengeInputs samples, visiting them round-robin.
func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
	samples := challengeInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(samples[n%len(samples)])
	}
}
// BenchmarkSHA256_BotRuleInputs measures SHA256sum over the botRuleInputs
// samples, visiting them round-robin.
func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
	samples := botRuleInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(samples[n%len(samples)])
	}
}
// BenchmarkXXHash_BotRuleInputs measures XXHash64sum over the botRuleInputs
// samples, visiting them round-robin.
func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
	samples := botRuleInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(samples[n%len(samples)])
	}
}
// BenchmarkSHA256_CELInputs measures SHA256sum over the celInputs samples,
// visiting them round-robin.
func BenchmarkSHA256_CELInputs(b *testing.B) {
	samples := celInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(samples[n%len(samples)])
	}
}
// BenchmarkXXHash_CELInputs measures XXHash64sum over the celInputs samples,
// visiting them round-robin.
func BenchmarkXXHash_CELInputs(b *testing.B) {
	samples := celInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(samples[n%len(samples)])
	}
}
// BenchmarkSHA256_ASNInputs measures SHA256sum over the asnInputs samples,
// visiting them round-robin.
func BenchmarkSHA256_ASNInputs(b *testing.B) {
	samples := asnInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(samples[n%len(samples)])
	}
}
// BenchmarkXXHash_ASNInputs measures XXHash64sum over the asnInputs samples,
// visiting them round-robin.
func BenchmarkXXHash_ASNInputs(b *testing.B) {
	samples := asnInputs
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(samples[n%len(samples)])
	}
}
// BenchmarkSHA256_PolicyList measures the hash-of-hashes pattern with
// SHA256sum: every policy input is hashed, the digests are written one per
// line into a builder, and the joined text is hashed once more.
func BenchmarkSHA256_PolicyList(b *testing.B) {
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		var joined strings.Builder
		for idx := range policyInputs {
			fmt.Fprintln(&joined, SHA256sum(policyInputs[idx]))
		}
		_ = SHA256sum(joined.String())
	}
}
// BenchmarkXXHash_PolicyList measures the hash-of-hashes pattern with
// XXHash64sum: every policy input is hashed, the digests are written one per
// line into a builder, and the joined text is hashed once more.
func BenchmarkXXHash_PolicyList(b *testing.B) {
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		var joined strings.Builder
		for idx := range policyInputs {
			fmt.Fprintln(&joined, XXHash64sum(policyInputs[idx]))
		}
		_ = XXHash64sum(joined.String())
	}
}
// TestHashCollisions checks that XXHash64sum yields distinct digests for
// distinct inputs across three input sets: the realistic fixtures declared in
// this file, a grid of similar header-like strings, and several sequential
// numeric patterns. Every digest is kept in one shared map, so collisions
// between the sets are detected as well as collisions within a set.
func TestHashCollisions(t *testing.T) {
	// Build the combined fixture list in a freshly allocated slice. Appending
	// directly onto policyInputs would write into that package-level slice's
	// backing array if it ever had spare capacity.
	groups := [][]string{policyInputs, challengeInputs, botRuleInputs, celInputs, asnInputs}
	total := 0
	for _, g := range groups {
		total += len(g)
	}
	allInputs := make([]string, 0, total)
	for _, g := range groups {
		allInputs = append(allInputs, g...)
	}

	seen := make(map[string]string) // digest -> first input that produced it
	collisions := 0

	// record hashes input and reports a collision when a different input
	// already produced the same digest. A repeated identical input is not a
	// collision and is ignored.
	record := func(phase, input string) {
		hash := XXHash64sum(input)
		if existing, exists := seen[hash]; exists && existing != input {
			t.Errorf("XXHash collision %s: %q and %q both hash to %s", phase, input, existing, hash)
			collisions++
		}
		seen[hash] = input
	}

	// Start with realistic inputs from actual usage.
	for _, input := range allInputs {
		record("detected", input)
	}
	t.Logf("Basic test: %d realistic inputs, %d collisions so far", len(allInputs), collisions)

	// Similar strings that differ only slightly from one another.
	prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
	suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
	variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}

	stressCount := 0
	for _, prefix := range prefixes {
		for _, suffix := range suffixes {
			for _, variation := range variations {
				for i := 0; i < 100; i++ {
					record("in stress test", fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i))
					stressCount++
				}
			}
		}
	}
	t.Logf("Stress test 1: %d similar string variations, %d collisions so far", stressCount, collisions)

	// Sequential patterns, where consecutive inputs differ by a counter only.
	patterns := []string{
		"192.168.1.%d",
		"10.0.0.%d",
		"172.16.%d.1",
		"challenge-%d",
		"bot-rule-%d",
		"policy-%016x",
		"session-%016x",
	}

	seqCount := 0
	for _, pattern := range patterns {
		for i := 0; i < 10000; i++ {
			record("in sequential test", fmt.Sprintf(pattern, i))
			seqCount++
		}
	}
	t.Logf("Stress test 2: %d sequential patterns, %d collisions so far", seqCount, collisions)

	totalInputs := len(allInputs) + stressCount + seqCount
	t.Logf("TOTAL: Tested %d inputs across realistic scenarios - %d collisions", totalInputs, collisions)
}
// TestXXHashFormat checks the textual shape of XXHash64sum output for a few
// representative inputs, including the empty string: the digest must be
// non-empty, at most 16 characters long (a 64-bit value in hex), and made up
// only of lowercase hexadecimal digits.
func TestXXHashFormat(t *testing.T) {
	inputs := []string{
		"short",
		"",
		"very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
	}

	for _, input := range inputs {
		hash := XXHash64sum(input)

		// A digest must never be empty, even for empty input.
		if hash == "" {
			t.Errorf("Empty hash for input %q", input)
		}

		// A 64-bit value needs at most 16 hex digits.
		if n := len(hash); n > 16 {
			t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, n)
		}

		// Report every character that is not a lowercase hex digit.
		for _, char := range hash {
			isDigit := char >= '0' && char <= '9'
			isLowerHex := char >= 'a' && char <= 'f'
			if isDigit || isLowerHex {
				continue
			}
			t.Errorf("Non-hex character %c in hash %s for input %q", char, hash, input)
		}

		t.Logf("Input: %q -> Hash: %s", input, hash)
	}
}

View File

@ -26,7 +26,7 @@ func (c *Client) ASNCheckerFor(asns []uint32) checker.Impl {
return &ASNChecker{ return &ASNChecker{
iptoasn: c.IPToASN, iptoasn: c.IPToASN,
asns: asnMap, asns: asnMap,
hash: internal.SHA256sum(sb.String()), hash: internal.FastHash(sb.String()),
} }
} }

View File

@ -90,7 +90,7 @@ func (s *Server) challengeFor(r *http.Request, difficulty int) string {
fp, fp,
difficulty, difficulty,
) )
return internal.SHA256sum(challengeData) return internal.FastHash(challengeData)
} }
func (s *Server) maybeReverseProxyHttpStatusOnly(w http.ResponseWriter, r *http.Request) { func (s *Server) maybeReverseProxyHttpStatusOnly(w http.ResponseWriter, r *http.Request) {

View File

@ -17,5 +17,5 @@ type Bot struct {
} }
func (b Bot) Hash() string { func (b Bot) Hash() string {
return internal.SHA256sum(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash())) return internal.FastHash(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash()))
} }

View File

@ -63,7 +63,7 @@ func NewCELChecker(cfg *config.ExpressionOrList) (*CELChecker, error) {
} }
func (cc *CELChecker) Hash() string { func (cc *CELChecker) Hash() string {
return internal.SHA256sum(cc.src) return internal.FastHash(cc.src)
} }
func (cc *CELChecker) Check(r *http.Request) (bool, error) { func (cc *CELChecker) Check(r *http.Request) (bool, error) {

View File

@ -28,7 +28,7 @@ func (staticHashChecker) Check(r *http.Request) (bool, error) {
func (s staticHashChecker) Hash() string { return s.hash } func (s staticHashChecker) Hash() string { return s.hash }
func NewStaticHashChecker(hashable string) checker.Impl { func NewStaticHashChecker(hashable string) checker.Impl {
return staticHashChecker{hash: internal.SHA256sum(hashable)} return staticHashChecker{hash: internal.FastHash(hashable)}
} }
type RemoteAddrChecker struct { type RemoteAddrChecker struct {
@ -55,7 +55,7 @@ func NewRemoteAddrChecker(cidrs []string) (checker.Impl, error) {
return &RemoteAddrChecker{ return &RemoteAddrChecker{
ranger: ranger, ranger: ranger,
hash: internal.SHA256sum(sb.String()), hash: internal.FastHash(sb.String()),
}, nil }, nil
} }
@ -101,7 +101,7 @@ func NewHeaderMatchesChecker(header, rexStr string) (checker.Impl, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err) return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
} }
return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.SHA256sum(header + ": " + rexStr)}, nil return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.FastHash(header + ": " + rexStr)}, nil
} }
func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) { func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) {
@ -126,7 +126,7 @@ func NewPathChecker(rexStr string) (checker.Impl, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err) return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
} }
return &PathChecker{rex, internal.SHA256sum(rexStr)}, nil return &PathChecker{rex, internal.FastHash(rexStr)}, nil
} }
func (pc *PathChecker) Check(r *http.Request) (bool, error) { func (pc *PathChecker) Check(r *http.Request) (bool, error) {
@ -158,7 +158,7 @@ func (hec headerExistsChecker) Check(r *http.Request) (bool, error) {
} }
func (hec headerExistsChecker) Hash() string { func (hec headerExistsChecker) Hash() string {
return internal.SHA256sum(hec.header) return internal.FastHash(hec.header)
} }
func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) { func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) {
@ -177,7 +177,7 @@ func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) {
continue continue
} }
result = append(result, &HeaderMatchesChecker{key, rex, internal.SHA256sum(key + ": " + rexStr)}) result = append(result, &HeaderMatchesChecker{key, rex, internal.FastHash(key + ": " + rexStr)})
} }
if len(errs) != 0 { if len(errs) != 0 {

View File

@ -37,5 +37,5 @@ func (l List) Hash() string {
fmt.Fprintln(&sb, c.Hash()) fmt.Fprintln(&sb, c.Hash())
} }
return internal.SHA256sum(sb.String()) return internal.FastHash(sb.String())
} }