From 2904ff974bc5bb0256a941436046f1d3b364559b Mon Sep 17 00:00:00 2001
From: Jason Cameron
Date: Fri, 13 Jun 2025 09:53:10 -0400
Subject: [PATCH] refactor(ogtags): optimize URL construction and memory
 allocations (#647)

* refactor(ogtags): optimize URL construction and memory allocations

* test(ogtags): add benchmarks and memory usage tests for OGTagCache

* refactor(ogtags): optimize OGTags subsystem to reduce allocations and improve request runtime by up to 66%

* Update docs/docs/CHANGELOG.md

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Jason Cameron

* refactor(ogtags): optimize URL string construction to reduce allocations

* Update internal/ogtags/ogtags.go

Co-authored-by: Xe Iaso
Signed-off-by: Jason Cameron

* test(ogtags): add fuzz tests for getTarget and extractOGTags functions

* fix(ogtags): update memory calculation logic

Previously the test claimed we had allocated 18 PB. The MemStats counters
are uint64, so a delta that dipped below zero wrapped around to roughly
2^64 bytes; casting to int64 before subtracting fixes the readings.

    === RUN   TestMemoryUsage
        mem_test.go:107: Memory allocated for 10k getTarget calls: 18014398509481904.00 KB
        mem_test.go:135: Memory allocated for 1k extractOGTags calls: 18014398509481978.00

Now it reports sane numbers:

    === RUN   TestMemoryUsage
        mem_test.go:109: Memory allocated for 10k getTarget calls:
        mem_test.go:110:   Total: 630.56 KB (0.62 MB)
        mem_test.go:111:   Per operation: 64.57 bytes
        mem_test.go:140: Memory allocated for 1k extractOGTags calls:
        mem_test.go:141:   Total: 328.17 KB (0.32 MB)
        mem_test.go:142:   Per operation: 336.05 bytes

* refactor(ogtags): optimize meta tag extraction for improved performance

* Update metadata

check-spelling run (pull_request) for json/ogmem

Signed-off-by: check-spelling-bot
on-behalf-of: @check-spelling

* chore: update CHANGELOG for recent optimizations and version bump

* refactor: improve URL construction and meta tag extraction logic

* style: cleanup fuzz tests

---------

Signed-off-by: Jason Cameron
Signed-off-by: check-spelling-bot
Signed-off-by: Jason Cameron
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Xe Iaso
---
 .github/actions/spelling/expect.txt |   5 +-
 docs/docs/CHANGELOG.md              |   1 +
 internal/ogtags/mem_test.go         | 148 +++++++++++++
 internal/ogtags/ogtags.go           |  54 +++--
 internal/ogtags/ogtags_fuzz_test.go | 308 ++++++++++++++++++++++++++++
 internal/ogtags/parse.go            |  52 +++--
 6 files changed, 519 insertions(+), 49 deletions(-)
 create mode 100644 internal/ogtags/mem_test.go
 create mode 100644 internal/ogtags/ogtags_fuzz_test.go

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 7d0e6ed..60d917f 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -49,6 +49,7 @@ coreutils
 Cotoyogi
 CRDs
 crt
+Cscript
 daemonizing
 DDOS
 Debian
@@ -69,7 +70,6 @@ duckduckbot
 eerror
 ellenjoe
 enbyware
-euo
 everyones
 evilbot
 evilsite
@@ -108,6 +108,7 @@ hebis
 hec
 hmc
 hostable
+htmlc
 htmx
 httpdebug
 hypertext
@@ -119,7 +120,6 @@ imgproxy
 inp
 iss
 isset
-itv
 ivh
 Jenomis
 JGit
@@ -249,6 +249,7 @@ traefik
 uberspace
 unixhttpd
 unmarshal
+unparseable
 uuidgen
 uvx
 UXP
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index 6be9207..8147866 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Implement a no-JS challenge method: [`metarefresh`](./admin/configuration/challenges/metarefresh.mdx) ([#95](https://github.com/TecharoHQ/anubis/issues/95))
 - Bump AI-robots.txt to version 1.34
 - Make progress bar styling more compatible (UXP, etc)
+- Optimized the OGTags subsystem: fewer allocations and up to 66% lower per-request runtime
 - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
 
 ## v1.19.1: Jenomis cen Lexentale - Echo 1
diff --git a/internal/ogtags/mem_test.go b/internal/ogtags/mem_test.go
new file mode 100644
index 0000000..9db4d68
--- /dev/null
+++ b/internal/ogtags/mem_test.go
@@ -0,0 +1,148 @@
+package ogtags
+
+import (
+	"net/url"
+	"runtime"
+	"strings"
+	"testing"
+
+	"golang.org/x/net/html"
+)
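+
+// NOTE: run these benchmarks with `go test -bench=. -benchmem` (adjusting
+// the package path as needed); -benchmem reports the B/op and allocs/op
+// columns that the optimizations in this change are meant to drive down.
+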
+func BenchmarkGetTarget(b *testing.B) {
+	tests := []struct {
+		name   string
+		target string
+		paths  []string
+	}{
+		{
+			name:   "HTTP",
+			target: "http://example.com",
+			paths:  []string{"/", "/path", "/path/to/resource", "/path?query=1&foo=bar"},
+		},
+		{
+			name:   "Unix",
+			target: "unix:///var/run/app.sock",
+			paths:  []string{"/", "/api/endpoint", "/api/endpoint?param=value"},
+		},
+	}
+
+	for _, tt := range tests {
+		b.Run(tt.name, func(b *testing.B) {
+			cache := NewOGTagCache(tt.target, false, 0, false)
+			urls := make([]*url.URL, len(tt.paths))
+			for i, path := range tt.paths {
+				u, _ := url.Parse(path)
+				urls[i] = u
+			}
+
+			b.ResetTimer()
+			b.ReportAllocs()
+
+			for i := 0; i < b.N; i++ {
+				_ = cache.getTarget(urls[i%len(urls)])
+			}
+		})
+	}
+}
+
+func BenchmarkExtractOGTags(b *testing.B) {
+	htmlSamples := []string{
+		`<html><head>
+<meta property="og:title" content="Test Title">
+<meta property="og:description" content="Test Description">
+<meta name="description" content="Regular description">
+</head></html>`,
+		`<html><head>
+<meta property="og:title" content="Test">
+<meta property="og:image" content="http://example.com/image.jpg">
+<meta property="twitter:card" content="summary">
+<meta property="fediverse:creator" content="@user@example.com">
+<meta name="keywords" content="test,keywords">
+<meta name="author" content="Test Author">
+<meta property="unapproved:tag" content="should be filtered">
+</head>
+<body>
+<div>Content</div>
+</body></html>`,
+	}
+
+	cache := NewOGTagCache("http://example.com", false, 0, false)
+	docs := make([]*html.Node, len(htmlSamples))
+
+	for i, sample := range htmlSamples {
+		doc, _ := html.Parse(strings.NewReader(sample))
+		docs[i] = doc
+	}
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = cache.extractOGTags(docs[i%len(docs)])
+	}
+}
+
+// Memory usage test
+func TestMemoryUsage(t *testing.T) {
+	cache := NewOGTagCache("http://example.com", false, 0, false)
+
+	// Force GC and wait for it to complete
+	runtime.GC()
+
+	var m1 runtime.MemStats
+	runtime.ReadMemStats(&m1)
+
+	// Run getTarget many times
+	u, _ := url.Parse("/path/to/resource?query=1&foo=bar&baz=qux")
+	for i := 0; i < 10000; i++ {
+		_ = cache.getTarget(u)
+	}
+
+	// Force GC after operations
+	runtime.GC()
+
+	var m2 runtime.MemStats
+	runtime.ReadMemStats(&m2)
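+
+	// Convert to int64 before subtracting: MemStats counters are uint64, and
+	// an unsigned difference that goes negative wraps around to ~2^64, which
+	// is how this test once reported ~18 PB of allocations.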
+	allocatedBytes := int64(m2.TotalAlloc) - int64(m1.TotalAlloc)
+	allocatedKB := float64(allocatedBytes) / 1024.0
+	allocatedPerOp := float64(allocatedBytes) / 10000.0
+
+	t.Logf("Memory allocated for 10k getTarget calls:")
+	t.Logf("  Total: %.2f KB (%.2f MB)", allocatedKB, allocatedKB/1024.0)
+	t.Logf("  Per operation: %.2f bytes", allocatedPerOp)
+
+	// Test extractOGTags memory usage
+	htmlDoc := `<html><head>
+<meta property="og:title" content="Test Title">
+<meta property="og:description" content="A longer description for testing">
+<meta property="og:image" content="http://example.com/image.jpg">
+<meta name="keywords" content="keyword1,keyword2,keyword3">
+<meta name="author" content="Test Author">
+<meta property="twitter:card" content="summary">
+</head></html>`
+
+	doc, _ := html.Parse(strings.NewReader(htmlDoc))
+
+	runtime.GC()
+	runtime.ReadMemStats(&m1)
+
+	for i := 0; i < 1000; i++ {
+		_ = cache.extractOGTags(doc)
+	}
+
+	runtime.GC()
+	runtime.ReadMemStats(&m2)
+
+	allocatedBytes = int64(m2.TotalAlloc) - int64(m1.TotalAlloc)
+	allocatedKB = float64(allocatedBytes) / 1024.0
+	allocatedPerOp = float64(allocatedBytes) / 1000.0
+
+	t.Logf("Memory allocated for 1k extractOGTags calls:")
+	t.Logf("  Total: %.2f KB (%.2f MB)", allocatedKB, allocatedKB/1024.0)
+	t.Logf("  Per operation: %.2f bytes", allocatedPerOp)
+
+	// Sanity checks
+	if allocatedPerOp > 10000 {
+		t.Errorf("extractOGTags allocating too much memory per operation: %.2f bytes", allocatedPerOp)
+	}
+}
diff --git a/internal/ogtags/ogtags.go b/internal/ogtags/ogtags.go
index 0301742..3779c06 100644
--- a/internal/ogtags/ogtags.go
+++ b/internal/ogtags/ogtags.go
@@ -13,8 +13,11 @@ import (
 )
 
 const (
-	maxContentLength = 16 << 20 // 16 MiB in bytes, if there is a reasonable reason that you need more than this...Why?
+	maxContentLength = 8 << 20 // 8 MiB is enough for anyone
 	httpTimeout      = 5 * time.Second /*todo: make this configurable?*/
+
+	schemeSeparatorLength = 3 // Length of "://"
+	querySeparatorLength  = 1 // Length of "?" for query strings
 )
 
 type OGTagCache struct {
@@ -26,11 +29,13 @@ type OGTagCache struct {
 	ogTimeToLive        time.Duration
 	ogCacheConsiderHost bool
 	ogPassthrough       bool
+
+	// Pre-built strings for optimization
+	unixPrefix string // "http://unix"
 }
 
 func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration, ogTagsConsiderHost bool) *OGTagCache {
 	// Predefined approved tags and prefixes
-	// In the future, these could come from configuration
 	defaultApprovedTags := []string{"description", "keywords", "author"}
 	defaultApprovedPrefixes := []string{"og:", "twitter:", "fediverse:"}
 
@@ -71,37 +76,50 @@ func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration
 
 	return &OGTagCache{
 		cache:               decaymap.New[string, map[string]string](),
-		targetURL:           parsedTargetURL, // Store the parsed URL
+		targetURL:           parsedTargetURL,
 		ogPassthrough:       ogPassthrough,
 		ogTimeToLive:        ogTimeToLive,
-		ogCacheConsiderHost: ogTagsConsiderHost, // todo: refactor to be a separate struct
+		ogCacheConsiderHost: ogTagsConsiderHost,
 		approvedTags:        defaultApprovedTags,
 		approvedPrefixes:    defaultApprovedPrefixes,
 		client:              client,
+		unixPrefix:          "http://unix",
 	}
 }
 
 // getTarget constructs the target URL string for fetching OG tags.
-// For Unix sockets, it creates a "fake" HTTP URL that the custom dialer understands.
+// Optimized to minimize allocations by building strings directly.
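+// For example (illustrative): with a target of "unix:///run/app.sock", a
+// request for "/path?q=1" becomes "http://unix/path?q=1" (the placeholder
+// host is ignored by the custom dialer), while a target of
+// "http://backend:8080" yields "http://backend:8080/path?q=1".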
 func (c *OGTagCache) getTarget(u *url.URL) string {
+	var escapedPath = u.EscapedPath() // will cause an allocation if path contains special characters
 	if c.targetURL.Scheme == "unix" {
-		// The custom dialer ignores the host, but we need a valid http URL structure.
-		// Use "unix" as a placeholder host. Path and Query from original request are appended.
-		fakeURL := &url.URL{
-			Scheme:   "http", // Scheme must be http/https for client.Get
-			Host:     "unix", // Arbitrary host, ignored by custom dialer
-			Path:     u.Path,
-			RawQuery: u.RawQuery,
+		// Build URL string directly without creating intermediate URL object
+		var sb strings.Builder
+		sb.Grow(len(c.unixPrefix) + len(escapedPath) + len(u.RawQuery) + querySeparatorLength) // Pre-allocate
+		sb.WriteString(c.unixPrefix)
+		sb.WriteString(escapedPath)
+		if u.RawQuery != "" {
+			sb.WriteByte('?')
+			sb.WriteString(u.RawQuery)
 		}
-		return fakeURL.String()
+		return sb.String()
 	}
 
-	// For regular http/https targets
-	target := *c.targetURL // Make a copy
-	target.Path = u.Path
-	target.RawQuery = u.RawQuery
-	return target.String()
+	// For regular http/https targets, build URL string directly
+	var sb strings.Builder
+	// Pre-calculate size: scheme + "://" + host + path + "?" + query
+	estimatedSize := len(c.targetURL.Scheme) + schemeSeparatorLength + len(c.targetURL.Host) + len(escapedPath) + len(u.RawQuery) + querySeparatorLength
+	sb.Grow(estimatedSize)
+	sb.WriteString(c.targetURL.Scheme)
+	sb.WriteString("://")
+	sb.WriteString(c.targetURL.Host)
+	sb.WriteString(escapedPath)
+	if u.RawQuery != "" {
+		sb.WriteByte('?')
+		sb.WriteString(u.RawQuery)
+	}
+
+	return sb.String()
 }
 
 func (c *OGTagCache) Cleanup() {
diff --git a/internal/ogtags/ogtags_fuzz_test.go b/internal/ogtags/ogtags_fuzz_test.go
new file mode 100644
index 0000000..c6e40fa
--- /dev/null
+++ b/internal/ogtags/ogtags_fuzz_test.go
@@ -0,0 +1,308 @@
+package ogtags
+
+import (
+	"net/url"
+	"strings"
+	"testing"
+	"unicode/utf8"
+
+	"golang.org/x/net/html"
+)
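+
+// Each fuzz target below can be run on its own with the standard Go fuzzing
+// tooling, e.g. (adjusting the package path as needed):
+//
+//	go test -fuzz=FuzzGetTarget -fuzztime=30s ./internal/ogtags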
+
+// FuzzGetTarget tests getTarget with various inputs
+func FuzzGetTarget(f *testing.F) {
+	// Seed corpus with interesting test cases
+	testCases := []struct {
+		target string
+		path   string
+		query  string
+	}{
+		{"http://example.com", "/", ""},
+		{"http://example.com", "/path", "q=1"},
+		{"unix:///tmp/socket", "/api", "key=value"},
+		{"https://example.com:8080", "/path/to/resource", "a=1&b=2"},
+		{"http://example.com", "/path with spaces", "q=hello world"},
+		{"http://example.com", "/path/❤️/emoji", "emoji=🎉"},
+		{"http://example.com", "/path/../../../etc/passwd", ""},
+		{"http://example.com", "/path%2F%2E%2E%2F", "q=%3Cscript%3E"},
+		{"unix:///var/run/app.sock", "/../../etc/passwd", ""},
+		{"http://[::1]:8080", "/ipv6", "test=1"},
+		{"http://example.com", strings.Repeat("/very/long/path", 100), strings.Repeat("param=value&", 100)},
+		{"http://example.com", "/path%20with%20encoded", "q=%20encoded%20"},
+		{"http://example.com", "/пример/кириллица", "q=тест"},
+		{"http://example.com", "/中文/路径", "查询=值"},
+		{"", "/path", "q=1"}, // Empty target
+	}
+
+	for _, tc := range testCases {
+		f.Add(tc.target, tc.path, tc.query)
+	}
+
+	f.Fuzz(func(t *testing.T, target, path, query string) {
+		// Skip invalid UTF-8 to focus on realistic inputs
+		if !utf8.ValidString(target) || !utf8.ValidString(path) || !utf8.ValidString(query) {
+			t.Skip()
+		}
+
+		// Create cache - should not panic
+		cache := NewOGTagCache(target, false, 0, false)
+
+		// Create URL
+		u := &url.URL{
+			Path:     path,
+			RawQuery: query,
+		}
+
+		// Call getTarget - should not panic
+		result := cache.getTarget(u)
+
+		// Basic validation
+		if result == "" {
+			t.Errorf("getTarget returned empty string for target=%q, path=%q, query=%q", target, path, query)
+		}
+
+		// Verify result is a valid URL (for non-empty targets)
+		if target != "" {
+			parsedResult, err := url.Parse(result)
+			if err != nil {
+				t.Errorf("getTarget produced invalid URL %q: %v", result, err)
+			} else {
+				// For unix sockets, verify the scheme is http
+				if strings.HasPrefix(target, "unix:") && parsedResult.Scheme != "http" {
+					t.Errorf("Unix socket URL should have http scheme, got %q", parsedResult.Scheme)
+				}
+			}
+		}
+
+		// Ensure no memory corruption by calling multiple times
+		for i := 0; i < 3; i++ {
+			result2 := cache.getTarget(u)
+			if result != result2 {
+				t.Errorf("getTarget not deterministic: %q != %q", result, result2)
+			}
+		}
+	})
+}
+
+// FuzzExtractOGTags tests extractOGTags with various HTML inputs
+func FuzzExtractOGTags(f *testing.F) {
+	// Seed corpus with interesting HTML cases
+	htmlCases := []string{
+		`<html><head><meta property="og:title" content="Test"></head></html>`,
+		`<meta property="og:title" content="No html or head wrapper">`,
+		`<html><head>` + strings.Repeat(`<meta property="og:title" content="x">`, 1000) + `</head></html>`,
+		`<html><head><meta property="og:title" content="First"><meta property="og:title" content="Duplicate"></head></html>`,
+		`<html><head><meta name="keywords" content="a,b,c"></head></html>`,
+		`<html><head><meta property="twitter:card" content="summary"></head></html>`,
+		`<html><head><meta property="fediverse:creator" content="@user@example.com"></head></html>`,
+		`<html><head><meta property="unapproved:tag" content="filtered"></head></html>`,
+		`<html><head><meta content="content before property" property="og:title"></head></html>`,
+		`<html><head><meta property='og:title' content='single quotes'></head></html>`,
+		`<html><head><meta property="og:title" content="unterminated`,
+		`<html><head><META PROPERTY="OG:TITLE" CONTENT="uppercase"></head></html>`,
+		`<html><body>` + strings.Repeat(`<div>`, 1000) + `<meta property="og:title" content="deep">` + strings.Repeat(`</div>`, 1000) + `</body></html>`,
+		`<html><head><meta property="og:title" content="日本語のタイトル"></head></html>`,
+		`<html><head><meta property="" content="empty property"></head></html>`,
+		`<html><head><meta property="og:title" content="some reasonably long content to exercise the copy path"></head></html>`,
+		`<html><head><meta property="og:title"></head></html>`, // No content
+		`<html></html>`, // Empty HTML
+		`<!-- <meta property="og:title" content="commented out"> -->`,
+		`<html><head><script><meta property="og:title" content="inside script"></script></head></html>`,
+		`<html><head><meta property="og:title" content="<nested>"></head></html>`,
+		`<html><body><head><meta property="og:title" content="head inside body"></head></body></html>`,
+	}
+
+	for _, htmlc := range htmlCases {
+		f.Add(htmlc)
+	}
+
+	f.Fuzz(func(t *testing.T, htmlContent string) {
+		// Skip invalid UTF-8
+		if !utf8.ValidString(htmlContent) {
+			t.Skip()
+		}
+
+		// Parse HTML - may fail on invalid input
+		doc, err := html.Parse(strings.NewReader(htmlContent))
+		if err != nil {
+			// This is expected for malformed HTML
+			return
+		}
+
+		cache := NewOGTagCache("http://example.com", false, 0, false)
+
+		// Should not panic
+		tags := cache.extractOGTags(doc)
+
+		// Validate results
+		for property, content := range tags {
+			// Ensure property is approved
+			approved := false
+			for _, prefix := range cache.approvedPrefixes {
+				if strings.HasPrefix(property, prefix) {
+					approved = true
+					break
+				}
+			}
+			if !approved {
+				for _, tag := range cache.approvedTags {
+					if property == tag {
+						approved = true
+						break
+					}
+				}
+			}
+			if !approved {
+				t.Errorf("Unapproved property %q was extracted", property)
+			}
+
+			// Ensure content is valid string
+			if !utf8.ValidString(content) {
+				t.Errorf("Invalid UTF-8 in content for property %q", property)
+			}
+		}
+
+		// Test determinism
+		tags2 := cache.extractOGTags(doc)
+		if len(tags) != len(tags2) {
+			t.Errorf("extractOGTags not deterministic: different lengths %d != %d", len(tags), len(tags2))
+		}
+		for k, v := range tags {
+			if tags2[k] != v {
+				t.Errorf("extractOGTags not deterministic: %q=%q != %q=%q", k, v, k, tags2[k])
+			}
+		}
+	})
+}
+
+// FuzzGetTargetRoundTrip tests that getTarget produces valid URLs that can be parsed back
+func FuzzGetTargetRoundTrip(f *testing.F) {
+	f.Add("http://example.com", "/path/to/resource", "key=value&foo=bar")
+	f.Add("unix:///tmp/socket", "/api/endpoint", "param=test")
+
+	f.Fuzz(func(t *testing.T, target, path, query string) {
+		if !utf8.ValidString(target) || !utf8.ValidString(path) || !utf8.ValidString(query) {
+			t.Skip()
+		}
+
+		cache := NewOGTagCache(target, false, 0, false)
+		u := &url.URL{Path: path, RawQuery: query}
+
+		result := cache.getTarget(u)
+		if result == "" {
+			return
+		}
+
+		// Parse the result back
+		parsed, err := url.Parse(result)
+		if err != nil {
+			t.Errorf("getTarget produced unparseable URL: %v", err)
+			return
+		}
+
+		// For non-unix targets, verify path preservation (accounting for encoding)
+		if !strings.HasPrefix(target, "unix:") && target != "" {
+			// The paths should match after normalization
+			expectedPath := u.EscapedPath()
+			if parsed.EscapedPath() != expectedPath {
+				t.Errorf("Path not preserved: want %q, got %q", expectedPath, parsed.EscapedPath())
+			}
+
+			// Query should be preserved exactly
+			if parsed.RawQuery != query {
+				t.Errorf("Query not preserved: want %q, got %q", query, parsed.RawQuery)
+			}
+		}
+	})
+}
+
+// FuzzExtractMetaTagInfo tests the extractMetaTagInfo function directly
+func FuzzExtractMetaTagInfo(f *testing.F) {
+	// Seed with various attribute combinations
+	f.Add("og:title", "Test Title", "property")
+	f.Add("keywords", "test,keywords", "name")
+	f.Add("og:description", "A description with \"quotes\"", "property")
+	f.Add("twitter:card", "summary", "property")
+	f.Add("unknown:tag", "Should be filtered", "property")
+	f.Add("", "Content without property", "property")
+	f.Add("og:title", "", "property") // Property without content
+
+	f.Fuzz(func(t *testing.T, propertyValue, contentValue, propertyKey string) {
+		if !utf8.ValidString(propertyValue) || !utf8.ValidString(contentValue) || !utf8.ValidString(propertyKey) {
+			t.Skip()
+		}
+
+		// Create a meta node
+		node := &html.Node{
+			Type: html.ElementNode,
+			Data: "meta",
+			Attr: []html.Attribute{
+				{Key: propertyKey, Val: propertyValue},
+				{Key: "content", Val: contentValue},
+			},
+		}
+
+		cache := NewOGTagCache("http://example.com", false, 0, false)
+
+		// Should not panic
+		property, content := cache.extractMetaTagInfo(node)
+
+		// If property is returned, it must be approved
+		if property != "" {
+			approved := false
+			for _, prefix := range cache.approvedPrefixes {
+				if strings.HasPrefix(property, prefix) {
+					approved = true
+					break
+				}
+			}
+			if !approved {
+				for _, tag := range cache.approvedTags {
+					if property == tag {
+						approved = true
+						break
+					}
+				}
+			}
+			if !approved {
+				t.Errorf("extractMetaTagInfo returned unapproved property: %q", property)
+			}
+		}
+
+		// Content should match input if property is approved
+		if property != "" && content != contentValue {
+			t.Errorf("Content mismatch: want %q, got %q", contentValue, content)
+		}
+	})
+}
+
+// Benchmark comparison for the fuzzed scenarios
+func BenchmarkFuzzedGetTarget(b *testing.B) {
+	// Test with various challenging inputs found during fuzzing
+	inputs := []struct {
+		name   string
+		target string
+		path   string
+		query  string
+	}{
+		{"Simple", "http://example.com", "/api", "k=v"},
+		{"LongPath", "http://example.com", strings.Repeat("/segment", 50), ""},
+		{"LongQuery", "http://example.com", "/", strings.Repeat("param=value&", 50)},
+		{"Unicode", "http://example.com", "/путь/路径/path", "q=значение"},
+		{"Encoded", "http://example.com", "/path%20with%20spaces", "q=%3Cscript%3E"},
+		{"Unix", "unix:///tmp/socket.sock", "/api/v1/resource", "id=123&format=json"},
+	}
+
+	for _, input := range inputs {
+		b.Run(input.name, func(b *testing.B) {
+			cache := NewOGTagCache(input.target, false, 0, false)
+			u := &url.URL{Path: input.path, RawQuery: input.query}
+
+			b.ResetTimer()
+			b.ReportAllocs()
+
+			for i := 0; i < b.N; i++ {
+				_ = cache.getTarget(u)
+			}
+		})
+	}
+}
diff --git a/internal/ogtags/parse.go b/internal/ogtags/parse.go
index 8828e59..c21fd79 100644
--- a/internal/ogtags/parse.go
+++ b/internal/ogtags/parse.go
@@ -12,15 +12,12 @@ func (c *OGTagCache) extractOGTags(doc *html.Node) map[string]string {
 
 	var traverseNodes func(*html.Node)
 	traverseNodes = func(n *html.Node) {
-		// isOGMetaTag only checks if it's a <meta> tag.
-		// The actual filtering happens in extractMetaTagInfo now.
 		if isOGMetaTag(n) {
 			property, content := c.extractMetaTagInfo(n)
 			if property != "" {
 				ogTags[property] = content
 			}
 		}
-
 		for child := n.FirstChild; child != nil; child = child.NextSibling {
 			traverseNodes(child)
 		}
@@ -39,43 +36,40 @@ func isOGMetaTag(n *html.Node) bool {
 }
 
 // extractMetaTagInfo extracts property and content from a meta tag
-// *and* checks if the property is approved.
-// Returns empty property string if the tag is not approved.
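+// For example, <meta property="og:title" content="Hi"> yields
+// ("og:title", "Hi"), while the unapproved <meta property="x:y" content="z">
+// yields ("", "z"): content comes back either way, but the property is
+// emptied so the caller drops the tag.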
 func (c *OGTagCache) extractMetaTagInfo(n *html.Node) (property, content string) {
-	var rawProperty string // Store the property found before approval check
+	var propertyKey string
+	// Single pass through attributes, using range to avoid bounds checking
 	for _, attr := range n.Attr {
-		if attr.Key == "property" || attr.Key == "name" {
-			rawProperty = attr.Val
-		}
-		if attr.Key == "content" {
+		switch attr.Key {
+		case "property", "name":
+			propertyKey = attr.Val
+		case "content":
 			content = attr.Val
 		}
-	}
-
-	// Check if the rawProperty is approved
-	isApproved := false
-	for _, prefix := range c.approvedPrefixes {
-		if strings.HasPrefix(rawProperty, prefix) {
-			isApproved = true
+		// Early exit if we have both
+		if propertyKey != "" && content != "" {
 			break
 		}
 	}
-	// Check exact approved tags if not already approved by prefix
-	if !isApproved {
-		for _, tag := range c.approvedTags {
-			if rawProperty == tag {
-				isApproved = true
-				break
-			}
+
+	if propertyKey == "" {
+		return "", content
+	}
+
+	// Check prefixes first (more common case)
+	for _, prefix := range c.approvedPrefixes {
+		if strings.HasPrefix(propertyKey, prefix) {
+			return propertyKey, content
 		}
 	}
 
-	// Only return the property if it's approved
-	if isApproved {
-		property = rawProperty
+	// Check exact matches
+	for _, tag := range c.approvedTags {
+		if propertyKey == tag {
+			return propertyKey, content
+		}
 	}
 
-	// Content is returned regardless, but property will be "" if not approved
-	return property, content
+	return "", content
 }