anubis/internal/hash_bench_test.go
Jason Cameron e2b46fc5e7
perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676)
* perf(internal): Use FastHash for internal hashing
docs: Add xxhash performance improvement to changelog entry
feat(hash): Add fast non-cryptographic hash function

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* test(hash): add xxhash benchmarks and collision tests

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Update metadata

check-spelling run (pull_request) for json/hash

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

---------

Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
2025-06-16 22:53:53 -04:00

262 lines
7.2 KiB
Go

package internal
import (
"fmt"
"strings"
"testing"
)
// XXHash64sum returns FastHash(text) unchanged. It gives the tests and
// benchmarks in this file an xxhash-named counterpart to SHA256sum so the two
// can be compared side by side.
func XXHash64sum(text string) string {
	return FastHash(text)
}
// Sample inputs shared by the benchmarks and collision tests below. Each
// group is meant to resemble strings that get hashed in real usage elsewhere
// in the codebase.
var (
	// policyInputs: header matches, path patterns and CIDR ranges of the kind
	// fed to policy checkers.
	policyInputs = []string{
		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
		"User-Agent: bot/1.0",
		"User-Agent: GoogleBot/2.1",
		"/robots.txt",
		"/api/.*",
		"10.0.0.0/8",
		"192.168.1.0/24",
		"172.16.0.0/12",
	}

	// challengeInputs: comma-separated key=value challenge strings, modelled
	// on the data built by the challengeFor function.
	challengeInputs = []string{
		"Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
		"Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
		"Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
	}

	// botRuleInputs: bot rule patterns in "name::kind:value" form.
	botRuleInputs = []string{
		"GoogleBot::path:/robots.txt",
		"BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
		"FacebookBot::headers:Accept-Language,User-Agent",
		"TwitterBot::cidr:192.168.1.0/24",
	}

	// celInputs: CEL expressions as they appear in policy rules.
	celInputs = []string{
		`request.headers["User-Agent"].contains("bot")`,
		`request.path.startsWith("/api/") && request.method == "POST"`,
		`request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
		`request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
	}

	// asnInputs: newline-separated ASN lists as used by the Thoth ASN checker.
	asnInputs = []string{
		"ASNChecker\nAS 15169\nAS 8075\nAS 32934",
		"ASNChecker\nAS 13335\nAS 16509\nAS 14061",
		"ASNChecker\nAS 36351\nAS 20940\nAS 8100",
	}
)
// BenchmarkSHA256_PolicyInputs measures SHA256sum on the policyInputs
// samples, hashing one entry per iteration and cycling through the slice.
func BenchmarkSHA256_PolicyInputs(b *testing.B) {
	total := len(policyInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(policyInputs[n%total])
	}
}
// BenchmarkXXHash_PolicyInputs measures XXHash64sum on the policyInputs
// samples, hashing one entry per iteration and cycling through the slice.
func BenchmarkXXHash_PolicyInputs(b *testing.B) {
	total := len(policyInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(policyInputs[n%total])
	}
}
// BenchmarkSHA256_ChallengeInputs measures SHA256sum on the challengeInputs
// samples, hashing one entry per iteration and cycling through the slice.
func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
	total := len(challengeInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(challengeInputs[n%total])
	}
}
// BenchmarkXXHash_ChallengeInputs measures XXHash64sum on the challengeInputs
// samples, hashing one entry per iteration and cycling through the slice.
func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
	total := len(challengeInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(challengeInputs[n%total])
	}
}
// BenchmarkSHA256_BotRuleInputs measures SHA256sum on the botRuleInputs
// samples, hashing one entry per iteration and cycling through the slice.
func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
	total := len(botRuleInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(botRuleInputs[n%total])
	}
}
// BenchmarkXXHash_BotRuleInputs measures XXHash64sum on the botRuleInputs
// samples, hashing one entry per iteration and cycling through the slice.
func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
	total := len(botRuleInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(botRuleInputs[n%total])
	}
}
// BenchmarkSHA256_CELInputs measures SHA256sum on the celInputs samples,
// hashing one expression per iteration and cycling through the slice.
func BenchmarkSHA256_CELInputs(b *testing.B) {
	total := len(celInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(celInputs[n%total])
	}
}
// BenchmarkXXHash_CELInputs measures XXHash64sum on the celInputs samples,
// hashing one expression per iteration and cycling through the slice.
func BenchmarkXXHash_CELInputs(b *testing.B) {
	total := len(celInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(celInputs[n%total])
	}
}
// BenchmarkSHA256_ASNInputs measures SHA256sum on the asnInputs samples,
// hashing one entry per iteration and cycling through the slice.
func BenchmarkSHA256_ASNInputs(b *testing.B) {
	total := len(asnInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = SHA256sum(asnInputs[n%total])
	}
}
// BenchmarkXXHash_ASNInputs measures XXHash64sum on the asnInputs samples,
// hashing one entry per iteration and cycling through the slice.
func BenchmarkXXHash_ASNInputs(b *testing.B) {
	total := len(asnInputs)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = XXHash64sum(asnInputs[n%total])
	}
}
// BenchmarkSHA256_PolicyList measures hashing a whole policy list with
// SHA256sum, modelled on the policy list hashing in checker.go: each
// policyInputs entry is hashed and written on its own line into a builder,
// then the accumulated text is hashed once more.
func BenchmarkSHA256_PolicyList(b *testing.B) {
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		var lines strings.Builder
		for idx := range policyInputs {
			fmt.Fprintln(&lines, SHA256sum(policyInputs[idx]))
		}
		_ = SHA256sum(lines.String())
	}
}
// BenchmarkXXHash_PolicyList measures hashing a whole policy list with
// XXHash64sum: each policyInputs entry is hashed and written on its own line
// into a builder, then the accumulated text is hashed once more.
func BenchmarkXXHash_PolicyList(b *testing.B) {
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		var lines strings.Builder
		for idx := range policyInputs {
			fmt.Fprintln(&lines, XXHash64sum(policyInputs[idx]))
		}
		_ = XXHash64sum(lines.String())
	}
}
// TestHashCollisions checks that XXHash64sum yields no collisions across the
// realistic sample inputs declared in this file plus two generated stress
// sets: similar header-like strings, and sequential patterns.
//
// Every hash goes into one shared map, so a collision between two distinct
// inputs is caught no matter which phase produced them. The summary logs
// report the number of collisions actually observed instead of asserting
// that there were none.
func TestHashCollisions(t *testing.T) {
	const (
		stressRepeats = 100   // numbered variants per prefix/suffix/variation combo
		seqRepeats    = 10000 // values substituted into each sequential pattern
	)

	// Build the combined list in a fresh slice instead of appending onto the
	// package-level policyInputs.
	var allInputs []string
	for _, group := range [][]string{policyInputs, challengeInputs, botRuleInputs, celInputs, asnInputs} {
		allInputs = append(allInputs, group...)
	}

	seen := make(map[string]string) // hash -> most recent input that produced it
	collisions := 0

	// record hashes input and reports a collision when a different input
	// already produced the same hash. An identical input seen twice is not a
	// collision and is ignored.
	record := func(label, input string) {
		hash := XXHash64sum(input)
		if existing, exists := seen[hash]; exists && existing != input {
			collisions++
			t.Errorf("%s: %q and %q both hash to %s", label, input, existing, hash)
		}
		seen[hash] = input
	}

	// Start with realistic inputs from actual usage.
	for _, input := range allInputs {
		record("XXHash collision detected", input)
	}
	basicCollisions := collisions
	t.Logf("Basic test: %d realistic inputs, %d collisions", len(allInputs), basicCollisions)

	// Similar strings that might cause hash collisions.
	prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
	suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
	variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}
	stressCount := 0
	for _, prefix := range prefixes {
		for _, suffix := range suffixes {
			for _, variation := range variations {
				for i := 0; i < stressRepeats; i++ {
					record("XXHash collision in stress test", fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i))
					stressCount++
				}
			}
		}
	}
	stressCollisions := collisions - basicCollisions
	t.Logf("Stress test 1: %d similar string variations, %d collisions", stressCount, stressCollisions)

	// Sequential patterns that might be problematic.
	patterns := []string{
		"192.168.1.%d",
		"10.0.0.%d",
		"172.16.%d.1",
		"challenge-%d",
		"bot-rule-%d",
		"policy-%016x",
		"session-%016x",
	}
	seqCount := 0
	for _, pattern := range patterns {
		for i := 0; i < seqRepeats; i++ {
			record("XXHash collision in sequential test", fmt.Sprintf(pattern, i))
			seqCount++
		}
	}
	seqCollisions := collisions - basicCollisions - stressCollisions
	t.Logf("Stress test 2: %d sequential patterns, %d collisions", seqCount, seqCollisions)

	totalInputs := len(allInputs) + stressCount + seqCount
	t.Logf("TOTAL: Tested %d inputs across realistic scenarios - %d collisions", totalInputs, collisions)
}
// TestXXHashFormat checks that XXHash64sum output is usable as a cache key:
// for each sample input the hash must be non-empty, at most 16 characters
// long (a 64-bit value in hex), and made up only of lowercase hex digits.
func TestXXHashFormat(t *testing.T) {
	samples := []string{
		"short",
		"",
		"very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
	}

	for _, input := range samples {
		hash := XXHash64sum(input)

		// Length bounds: the two failure cases are mutually exclusive.
		switch size := len(hash); {
		case size == 0:
			t.Errorf("Empty hash for input %q", input)
		case size > 16:
			t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, len(hash))
		}

		// Every character must be a lowercase hex digit; each offender is
		// reported individually.
		for _, r := range hash {
			isDigit := r >= '0' && r <= '9'
			isLowerHexLetter := r >= 'a' && r <= 'f'
			if isDigit || isLowerHexLetter {
				continue
			}
			t.Errorf("Non-hex character %c in hash %s for input %q", r, hash, input)
		}

		t.Logf("Input: %q -> Hash: %s", input, hash)
	}
}