mirror of
https://github.com/TecharoHQ/anubis.git
synced 2025-09-07 11:44:55 -04:00
refactor(ogtags): optimize URL construction and memory allocations (#647)
* refactor(ogtags): optimize URL construction and memory allocations * test(ogtags): add benchmarks and memory usage tests for OGTagCache * refactor(ogtags): optimize OGTags subsystem to reduce allocations and improve request runtime by up to 66% * Update docs/docs/CHANGELOG.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> * refactor(ogtags): optimize URL string construction to reduce allocations * Update internal/ogtags/ogtags.go Co-authored-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> * test(ogtags): add fuzz tests for getTarget and extractOGTags functions * fix(ogtags): update memory calculation logic Prev it would say that we had allocated 18pb === RUN TestMemoryUsage mem_test.go:107: Memory allocated for 10k getTarget calls: 18014398509481904.00 KB mem_test.go:135: Memory allocated for 1k extractOGTags calls: 18014398509481978.00 Now it's fixed with === RUN TestMemoryUsage mem_test.go:109: Memory allocated for 10k getTarget calls: mem_test.go:110: Total: 630.56 KB (0.62 MB) mem_test.go:111: Per operation: 64.57 bytes mem_test.go:140: Memory allocated for 1k extractOGTags calls: mem_test.go:141: Total: 328.17 KB (0.32 MB) mem_test.go:142: Per operation: 336.05 bytes * refactor(ogtags): optimize meta tag extraction for improved performance * Update metadata check-spelling run (pull_request) for json/ogmem Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * chore: update CHANGELOG for recent optimizations and version bump * refactor: improve URL construction and meta tag extraction logic * style: cleanup fuzz tests --------- Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> Signed-off-by: Jason Cameron <git@jasoncameron.dev> Co-authored-by: Copilot 
<175728472+Copilot@users.noreply.github.com> Co-authored-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
parent
3b3080d497
commit
2904ff974b
5
.github/actions/spelling/expect.txt
vendored
5
.github/actions/spelling/expect.txt
vendored
@ -49,6 +49,7 @@ coreutils
|
|||||||
Cotoyogi
|
Cotoyogi
|
||||||
CRDs
|
CRDs
|
||||||
crt
|
crt
|
||||||
|
Cscript
|
||||||
daemonizing
|
daemonizing
|
||||||
DDOS
|
DDOS
|
||||||
Debian
|
Debian
|
||||||
@ -69,7 +70,6 @@ duckduckbot
|
|||||||
eerror
|
eerror
|
||||||
ellenjoe
|
ellenjoe
|
||||||
enbyware
|
enbyware
|
||||||
euo
|
|
||||||
everyones
|
everyones
|
||||||
evilbot
|
evilbot
|
||||||
evilsite
|
evilsite
|
||||||
@ -108,6 +108,7 @@ hebis
|
|||||||
hec
|
hec
|
||||||
hmc
|
hmc
|
||||||
hostable
|
hostable
|
||||||
|
htmlc
|
||||||
htmx
|
htmx
|
||||||
httpdebug
|
httpdebug
|
||||||
hypertext
|
hypertext
|
||||||
@ -119,7 +120,6 @@ imgproxy
|
|||||||
inp
|
inp
|
||||||
iss
|
iss
|
||||||
isset
|
isset
|
||||||
itv
|
|
||||||
ivh
|
ivh
|
||||||
Jenomis
|
Jenomis
|
||||||
JGit
|
JGit
|
||||||
@ -249,6 +249,7 @@ traefik
|
|||||||
uberspace
|
uberspace
|
||||||
unixhttpd
|
unixhttpd
|
||||||
unmarshal
|
unmarshal
|
||||||
|
unparseable
|
||||||
uuidgen
|
uuidgen
|
||||||
uvx
|
uvx
|
||||||
UXP
|
UXP
|
||||||
|
@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- Implement a no-JS challenge method: [`metarefresh`](./admin/configuration/challenges/metarefresh.mdx) ([#95](https://github.com/TecharoHQ/anubis/issues/95))
|
- Implement a no-JS challenge method: [`metarefresh`](./admin/configuration/challenges/metarefresh.mdx) ([#95](https://github.com/TecharoHQ/anubis/issues/95))
|
||||||
- Bump AI-robots.txt to version 1.34
|
- Bump AI-robots.txt to version 1.34
|
||||||
- Make progress bar styling more compatible (UXP, etc)
|
- Make progress bar styling more compatible (UXP, etc)
|
||||||
|
- Optimized the OGTags subsystem, reducing allocations and cutting per-request runtime by up to 66%
|
||||||
- Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
|
- Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
|
||||||
|
|
||||||
## v1.19.1: Jenomis cen Lexentale - Echo 1
|
## v1.19.1: Jenomis cen Lexentale - Echo 1
|
||||||
|
148
internal/ogtags/mem_test.go
Normal file
148
internal/ogtags/mem_test.go
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
package ogtags
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
"net/url"
|
||||||
|
"runtime"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkGetTarget(b *testing.B) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
target string
|
||||||
|
paths []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "HTTP",
|
||||||
|
target: "http://example.com",
|
||||||
|
paths: []string{"/", "/path", "/path/to/resource", "/path?query=1&foo=bar"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Unix",
|
||||||
|
target: "unix:///var/run/app.sock",
|
||||||
|
paths: []string{"/", "/api/endpoint", "/api/endpoint?param=value"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
b.Run(tt.name, func(b *testing.B) {
|
||||||
|
cache := NewOGTagCache(tt.target, false, 0, false)
|
||||||
|
urls := make([]*url.URL, len(tt.paths))
|
||||||
|
for i, path := range tt.paths {
|
||||||
|
u, _ := url.Parse(path)
|
||||||
|
urls[i] = u
|
||||||
|
}
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = cache.getTarget(urls[i%len(urls)])
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkExtractOGTags(b *testing.B) {
|
||||||
|
htmlSamples := []string{
|
||||||
|
`<html><head>
|
||||||
|
<meta property="og:title" content="Test Title">
|
||||||
|
<meta property="og:description" content="Test Description">
|
||||||
|
<meta name="keywords" content="test,keywords">
|
||||||
|
</head><body></body></html>`,
|
||||||
|
`<html><head>
|
||||||
|
<meta property="og:title" content="Page Title">
|
||||||
|
<meta property="og:type" content="website">
|
||||||
|
<meta property="og:url" content="https://example.com">
|
||||||
|
<meta property="og:image" content="https://example.com/image.jpg">
|
||||||
|
<meta property="twitter:card" content="summary_large_image">
|
||||||
|
<meta property="twitter:title" content="Twitter Title">
|
||||||
|
<meta name="description" content="Page description">
|
||||||
|
<meta name="author" content="John Doe">
|
||||||
|
</head><body><div><p>Content</p></div></body></html>`,
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := NewOGTagCache("http://example.com", false, 0, false)
|
||||||
|
docs := make([]*html.Node, len(htmlSamples))
|
||||||
|
|
||||||
|
for i, sample := range htmlSamples {
|
||||||
|
doc, _ := html.Parse(strings.NewReader(sample))
|
||||||
|
docs[i] = doc
|
||||||
|
}
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = cache.extractOGTags(docs[i%len(docs)])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Memory usage test
|
||||||
|
func TestMemoryUsage(t *testing.T) {
|
||||||
|
cache := NewOGTagCache("http://example.com", false, 0, false)
|
||||||
|
|
||||||
|
// Force GC and wait for it to complete
|
||||||
|
runtime.GC()
|
||||||
|
|
||||||
|
var m1 runtime.MemStats
|
||||||
|
runtime.ReadMemStats(&m1)
|
||||||
|
|
||||||
|
// Run getTarget many times
|
||||||
|
u, _ := url.Parse("/path/to/resource?query=1&foo=bar&baz=qux")
|
||||||
|
for i := 0; i < 10000; i++ {
|
||||||
|
_ = cache.getTarget(u)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Force GC after operations
|
||||||
|
runtime.GC()
|
||||||
|
|
||||||
|
var m2 runtime.MemStats
|
||||||
|
runtime.ReadMemStats(&m2)
|
||||||
|
|
||||||
|
allocatedBytes := int64(m2.TotalAlloc) - int64(m1.TotalAlloc)
|
||||||
|
allocatedKB := float64(allocatedBytes) / 1024.0
|
||||||
|
allocatedPerOp := float64(allocatedBytes) / 10000.0
|
||||||
|
|
||||||
|
t.Logf("Memory allocated for 10k getTarget calls:")
|
||||||
|
t.Logf(" Total: %.2f KB (%.2f MB)", allocatedKB, allocatedKB/1024.0)
|
||||||
|
t.Logf(" Per operation: %.2f bytes", allocatedPerOp)
|
||||||
|
|
||||||
|
// Test extractOGTags memory usage
|
||||||
|
htmlDoc := `<html><head>
|
||||||
|
<meta property="og:title" content="Test Title">
|
||||||
|
<meta property="og:description" content="Test Description">
|
||||||
|
<meta property="og:image" content="https://example.com/image.jpg">
|
||||||
|
<meta property="twitter:card" content="summary">
|
||||||
|
<meta name="keywords" content="test,keywords,example">
|
||||||
|
<meta name="author" content="Test Author">
|
||||||
|
<meta property="unknown:tag" content="Should be ignored">
|
||||||
|
</head><body></body></html>`
|
||||||
|
|
||||||
|
doc, _ := html.Parse(strings.NewReader(htmlDoc))
|
||||||
|
|
||||||
|
runtime.GC()
|
||||||
|
runtime.ReadMemStats(&m1)
|
||||||
|
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
_ = cache.extractOGTags(doc)
|
||||||
|
}
|
||||||
|
|
||||||
|
runtime.GC()
|
||||||
|
runtime.ReadMemStats(&m2)
|
||||||
|
|
||||||
|
allocatedBytes = int64(m2.TotalAlloc) - int64(m1.TotalAlloc)
|
||||||
|
allocatedKB = float64(allocatedBytes) / 1024.0
|
||||||
|
allocatedPerOp = float64(allocatedBytes) / 1000.0
|
||||||
|
|
||||||
|
t.Logf("Memory allocated for 1k extractOGTags calls:")
|
||||||
|
t.Logf(" Total: %.2f KB (%.2f MB)", allocatedKB, allocatedKB/1024.0)
|
||||||
|
t.Logf(" Per operation: %.2f bytes", allocatedPerOp)
|
||||||
|
|
||||||
|
// Sanity checks
|
||||||
|
if allocatedPerOp > 10000 {
|
||||||
|
t.Errorf("extractOGTags allocating too much memory per operation: %.2f bytes", allocatedPerOp)
|
||||||
|
}
|
||||||
|
}
|
@ -13,8 +13,11 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
maxContentLength = 16 << 20 // 16 MiB in bytes, if there is a reasonable reason that you need more than this...Why?
|
maxContentLength = 8 << 20 // 8 MiB is enough for anyone
|
||||||
httpTimeout = 5 * time.Second /*todo: make this configurable?*/
|
httpTimeout = 5 * time.Second /*todo: make this configurable?*/
|
||||||
|
|
||||||
|
schemeSeparatorLength = 3 // Length of "://"
|
||||||
|
querySeparatorLength = 1 // Length of "?" for query strings
|
||||||
)
|
)
|
||||||
|
|
||||||
type OGTagCache struct {
|
type OGTagCache struct {
|
||||||
@ -26,11 +29,13 @@ type OGTagCache struct {
|
|||||||
ogTimeToLive time.Duration
|
ogTimeToLive time.Duration
|
||||||
ogCacheConsiderHost bool
|
ogCacheConsiderHost bool
|
||||||
ogPassthrough bool
|
ogPassthrough bool
|
||||||
|
|
||||||
|
// Pre-built strings for optimization
|
||||||
|
unixPrefix string // "http://unix"
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration, ogTagsConsiderHost bool) *OGTagCache {
|
func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration, ogTagsConsiderHost bool) *OGTagCache {
|
||||||
// Predefined approved tags and prefixes
|
// Predefined approved tags and prefixes
|
||||||
// In the future, these could come from configuration
|
|
||||||
defaultApprovedTags := []string{"description", "keywords", "author"}
|
defaultApprovedTags := []string{"description", "keywords", "author"}
|
||||||
defaultApprovedPrefixes := []string{"og:", "twitter:", "fediverse:"}
|
defaultApprovedPrefixes := []string{"og:", "twitter:", "fediverse:"}
|
||||||
|
|
||||||
@ -71,37 +76,50 @@ func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration
|
|||||||
|
|
||||||
return &OGTagCache{
|
return &OGTagCache{
|
||||||
cache: decaymap.New[string, map[string]string](),
|
cache: decaymap.New[string, map[string]string](),
|
||||||
targetURL: parsedTargetURL, // Store the parsed URL
|
targetURL: parsedTargetURL,
|
||||||
ogPassthrough: ogPassthrough,
|
ogPassthrough: ogPassthrough,
|
||||||
ogTimeToLive: ogTimeToLive,
|
ogTimeToLive: ogTimeToLive,
|
||||||
ogCacheConsiderHost: ogTagsConsiderHost, // todo: refactor to be a separate struct
|
ogCacheConsiderHost: ogTagsConsiderHost,
|
||||||
approvedTags: defaultApprovedTags,
|
approvedTags: defaultApprovedTags,
|
||||||
approvedPrefixes: defaultApprovedPrefixes,
|
approvedPrefixes: defaultApprovedPrefixes,
|
||||||
client: client,
|
client: client,
|
||||||
|
unixPrefix: "http://unix",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// getTarget constructs the target URL string for fetching OG tags.
|
// getTarget constructs the target URL string for fetching OG tags.
|
||||||
// For Unix sockets, it creates a "fake" HTTP URL that the custom dialer understands.
|
// Optimized to minimize allocations by building strings directly.
|
||||||
func (c *OGTagCache) getTarget(u *url.URL) string {
|
func (c *OGTagCache) getTarget(u *url.URL) string {
|
||||||
|
var escapedPath = u.EscapedPath() // will cause an allocation if path contains special characters
|
||||||
if c.targetURL.Scheme == "unix" {
|
if c.targetURL.Scheme == "unix" {
|
||||||
// The custom dialer ignores the host, but we need a valid http URL structure.
|
// Build URL string directly without creating intermediate URL object
|
||||||
// Use "unix" as a placeholder host. Path and Query from original request are appended.
|
var sb strings.Builder
|
||||||
fakeURL := &url.URL{
|
sb.Grow(len(c.unixPrefix) + len(escapedPath) + len(u.RawQuery) + querySeparatorLength) // Pre-allocate
|
||||||
Scheme: "http", // Scheme must be http/https for client.Get
|
sb.WriteString(c.unixPrefix)
|
||||||
Host: "unix", // Arbitrary host, ignored by custom dialer
|
sb.WriteString(escapedPath)
|
||||||
Path: u.Path,
|
if u.RawQuery != "" {
|
||||||
RawQuery: u.RawQuery,
|
sb.WriteByte('?')
|
||||||
|
sb.WriteString(u.RawQuery)
|
||||||
}
|
}
|
||||||
return fakeURL.String()
|
return sb.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// For regular http/https targets
|
// For regular http/https targets, build URL string directly
|
||||||
target := *c.targetURL // Make a copy
|
var sb strings.Builder
|
||||||
target.Path = u.Path
|
// Pre-calculate size: scheme + "://" + host + path + "?" + query
|
||||||
target.RawQuery = u.RawQuery
|
estimatedSize := len(c.targetURL.Scheme) + schemeSeparatorLength + len(c.targetURL.Host) + len(escapedPath) + len(u.RawQuery) + querySeparatorLength
|
||||||
return target.String()
|
sb.Grow(estimatedSize)
|
||||||
|
|
||||||
|
sb.WriteString(c.targetURL.Scheme)
|
||||||
|
sb.WriteString("://")
|
||||||
|
sb.WriteString(c.targetURL.Host)
|
||||||
|
sb.WriteString(escapedPath)
|
||||||
|
if u.RawQuery != "" {
|
||||||
|
sb.WriteByte('?')
|
||||||
|
sb.WriteString(u.RawQuery)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *OGTagCache) Cleanup() {
|
func (c *OGTagCache) Cleanup() {
|
||||||
|
308
internal/ogtags/ogtags_fuzz_test.go
Normal file
308
internal/ogtags/ogtags_fuzz_test.go
Normal file
@ -0,0 +1,308 @@
|
|||||||
|
package ogtags
|
||||||
|
|
||||||
|
import (
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FuzzGetTarget tests getTarget with various inputs
|
||||||
|
func FuzzGetTarget(f *testing.F) {
|
||||||
|
// Seed corpus with interesting test cases
|
||||||
|
testCases := []struct {
|
||||||
|
target string
|
||||||
|
path string
|
||||||
|
query string
|
||||||
|
}{
|
||||||
|
{"http://example.com", "/", ""},
|
||||||
|
{"http://example.com", "/path", "q=1"},
|
||||||
|
{"unix:///tmp/socket", "/api", "key=value"},
|
||||||
|
{"https://example.com:8080", "/path/to/resource", "a=1&b=2"},
|
||||||
|
{"http://example.com", "/path with spaces", "q=hello world"},
|
||||||
|
{"http://example.com", "/path/❤️/emoji", "emoji=🎉"},
|
||||||
|
{"http://example.com", "/path/../../../etc/passwd", ""},
|
||||||
|
{"http://example.com", "/path%2F%2E%2E%2F", "q=%3Cscript%3E"},
|
||||||
|
{"unix:///var/run/app.sock", "/../../etc/passwd", ""},
|
||||||
|
{"http://[::1]:8080", "/ipv6", "test=1"},
|
||||||
|
{"http://example.com", strings.Repeat("/very/long/path", 100), strings.Repeat("param=value&", 100)},
|
||||||
|
{"http://example.com", "/path%20with%20encoded", "q=%20encoded%20"},
|
||||||
|
{"http://example.com", "/пример/кириллица", "q=тест"},
|
||||||
|
{"http://example.com", "/中文/路径", "查询=值"},
|
||||||
|
{"", "/path", "q=1"}, // Empty target
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
f.Add(tc.target, tc.path, tc.query)
|
||||||
|
}
|
||||||
|
|
||||||
|
f.Fuzz(func(t *testing.T, target, path, query string) {
|
||||||
|
// Skip invalid UTF-8 to focus on realistic inputs
|
||||||
|
if !utf8.ValidString(target) || !utf8.ValidString(path) || !utf8.ValidString(query) {
|
||||||
|
t.Skip()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create cache - should not panic
|
||||||
|
cache := NewOGTagCache(target, false, 0, false)
|
||||||
|
|
||||||
|
// Create URL
|
||||||
|
u := &url.URL{
|
||||||
|
Path: path,
|
||||||
|
RawQuery: query,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call getTarget - should not panic
|
||||||
|
result := cache.getTarget(u)
|
||||||
|
|
||||||
|
// Basic validation
|
||||||
|
if result == "" {
|
||||||
|
t.Errorf("getTarget returned empty string for target=%q, path=%q, query=%q", target, path, query)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify result is a valid URL (for non-empty targets)
|
||||||
|
if target != "" {
|
||||||
|
parsedResult, err := url.Parse(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("getTarget produced invalid URL %q: %v", result, err)
|
||||||
|
} else {
|
||||||
|
// For unix sockets, verify the scheme is http
|
||||||
|
if strings.HasPrefix(target, "unix:") && parsedResult.Scheme != "http" {
|
||||||
|
t.Errorf("Unix socket URL should have http scheme, got %q", parsedResult.Scheme)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure no memory corruption by calling multiple times
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
result2 := cache.getTarget(u)
|
||||||
|
if result != result2 {
|
||||||
|
t.Errorf("getTarget not deterministic: %q != %q", result, result2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzExtractOGTags tests extractOGTags with various HTML inputs
|
||||||
|
func FuzzExtractOGTags(f *testing.F) {
|
||||||
|
// Seed corpus with interesting HTML cases
|
||||||
|
htmlCases := []string{
|
||||||
|
`<html><head><meta property="og:title" content="Test"></head></html>`,
|
||||||
|
`<meta property="og:title" content="No HTML tags">`,
|
||||||
|
`<html><head>` + strings.Repeat(`<meta property="og:title" content="Many tags">`, 1000) + `</head></html>`,
|
||||||
|
`<html><head><meta property="og:title" content="<script>alert('xss')</script>"></head></html>`,
|
||||||
|
`<html><head><meta property="og:title" content="Line1 Line2"></head></html>`,
|
||||||
|
`<html><head><meta property="og:emoji" content="❤️🎉🎊"></head></html>`,
|
||||||
|
`<html><head><meta property="og:title" content="` + strings.Repeat("A", 10000) + `"></head></html>`,
|
||||||
|
`<html><head><meta property="og:title" content='Single quotes'></head></html>`,
|
||||||
|
`<html><head><meta property=og:title content=no-quotes></head></html>`,
|
||||||
|
`<html><head><meta name="keywords" content="test,keywords"></head></html>`,
|
||||||
|
`<html><head><meta property="unknown:tag" content="Should be ignored"></head></html>`,
|
||||||
|
`<html><head><meta property="` + strings.Repeat("og:", 100) + `title" content="Nested prefixes"></head></html>`,
|
||||||
|
`<html>` + strings.Repeat(`<div>`, 1000) + `<meta property="og:title" content="Deep nesting">` + strings.Repeat(`</div>`, 1000) + `</html>`,
|
||||||
|
`<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml"><head><meta property="og:title" content="With doctype"/></head></html>`,
|
||||||
|
`<html><head><meta property="" content="Empty property"></head></html>`,
|
||||||
|
`<html><head><meta content="Content only"></head></html>`,
|
||||||
|
`<html><head><meta property="og:title"></head></html>`, // No content
|
||||||
|
``, // Empty HTML
|
||||||
|
`<html><head><meta property="og:title" content="Кириллица"></head></html>`,
|
||||||
|
`<html><head><meta property="og:title" content="中文内容"></head></html>`,
|
||||||
|
`<html><head><!--<meta property="og:title" content="Commented out">--></head></html>`,
|
||||||
|
`<html><head><META PROPERTY="OG:TITLE" CONTENT="UPPERCASE"></head></html>`,
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, htmlc := range htmlCases {
|
||||||
|
f.Add(htmlc)
|
||||||
|
}
|
||||||
|
|
||||||
|
f.Fuzz(func(t *testing.T, htmlContent string) {
|
||||||
|
// Skip invalid UTF-8
|
||||||
|
if !utf8.ValidString(htmlContent) {
|
||||||
|
t.Skip()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse HTML - may fail on invalid input
|
||||||
|
doc, err := html.Parse(strings.NewReader(htmlContent))
|
||||||
|
if err != nil {
|
||||||
|
// This is expected for malformed HTML
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := NewOGTagCache("http://example.com", false, 0, false)
|
||||||
|
|
||||||
|
// Should not panic
|
||||||
|
tags := cache.extractOGTags(doc)
|
||||||
|
|
||||||
|
// Validate results
|
||||||
|
for property, content := range tags {
|
||||||
|
// Ensure property is approved
|
||||||
|
approved := false
|
||||||
|
for _, prefix := range cache.approvedPrefixes {
|
||||||
|
if strings.HasPrefix(property, prefix) {
|
||||||
|
approved = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !approved {
|
||||||
|
for _, tag := range cache.approvedTags {
|
||||||
|
if property == tag {
|
||||||
|
approved = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !approved {
|
||||||
|
t.Errorf("Unapproved property %q was extracted", property)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure content is valid string
|
||||||
|
if !utf8.ValidString(content) {
|
||||||
|
t.Errorf("Invalid UTF-8 in content for property %q", property)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test determinism
|
||||||
|
tags2 := cache.extractOGTags(doc)
|
||||||
|
if len(tags) != len(tags2) {
|
||||||
|
t.Errorf("extractOGTags not deterministic: different lengths %d != %d", len(tags), len(tags2))
|
||||||
|
}
|
||||||
|
for k, v := range tags {
|
||||||
|
if tags2[k] != v {
|
||||||
|
t.Errorf("extractOGTags not deterministic: %q=%q != %q=%q", k, v, k, tags2[k])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzGetTargetRoundTrip tests that getTarget produces valid URLs that can be parsed back
|
||||||
|
func FuzzGetTargetRoundTrip(f *testing.F) {
|
||||||
|
f.Add("http://example.com", "/path/to/resource", "key=value&foo=bar")
|
||||||
|
f.Add("unix:///tmp/socket", "/api/endpoint", "param=test")
|
||||||
|
|
||||||
|
f.Fuzz(func(t *testing.T, target, path, query string) {
|
||||||
|
if !utf8.ValidString(target) || !utf8.ValidString(path) || !utf8.ValidString(query) {
|
||||||
|
t.Skip()
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := NewOGTagCache(target, false, 0, false)
|
||||||
|
u := &url.URL{Path: path, RawQuery: query}
|
||||||
|
|
||||||
|
result := cache.getTarget(u)
|
||||||
|
if result == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse the result back
|
||||||
|
parsed, err := url.Parse(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("getTarget produced unparseable URL: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// For non-unix targets, verify path preservation (accounting for encoding)
|
||||||
|
if !strings.HasPrefix(target, "unix:") && target != "" {
|
||||||
|
// The paths should match after normalization
|
||||||
|
expectedPath := u.EscapedPath()
|
||||||
|
if parsed.EscapedPath() != expectedPath {
|
||||||
|
t.Errorf("Path not preserved: want %q, got %q", expectedPath, parsed.EscapedPath())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Query should be preserved exactly
|
||||||
|
if parsed.RawQuery != query {
|
||||||
|
t.Errorf("Query not preserved: want %q, got %q", query, parsed.RawQuery)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzExtractMetaTagInfo tests the extractMetaTagInfo function directly
|
||||||
|
func FuzzExtractMetaTagInfo(f *testing.F) {
|
||||||
|
// Seed with various attribute combinations
|
||||||
|
f.Add("og:title", "Test Title", "property")
|
||||||
|
f.Add("keywords", "test,keywords", "name")
|
||||||
|
f.Add("og:description", "A description with \"quotes\"", "property")
|
||||||
|
f.Add("twitter:card", "summary", "property")
|
||||||
|
f.Add("unknown:tag", "Should be filtered", "property")
|
||||||
|
f.Add("", "Content without property", "property")
|
||||||
|
f.Add("og:title", "", "property") // Property without content
|
||||||
|
|
||||||
|
f.Fuzz(func(t *testing.T, propertyValue, contentValue, propertyKey string) {
|
||||||
|
if !utf8.ValidString(propertyValue) || !utf8.ValidString(contentValue) || !utf8.ValidString(propertyKey) {
|
||||||
|
t.Skip()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a meta node
|
||||||
|
node := &html.Node{
|
||||||
|
Type: html.ElementNode,
|
||||||
|
Data: "meta",
|
||||||
|
Attr: []html.Attribute{
|
||||||
|
{Key: propertyKey, Val: propertyValue},
|
||||||
|
{Key: "content", Val: contentValue},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := NewOGTagCache("http://example.com", false, 0, false)
|
||||||
|
|
||||||
|
// Should not panic
|
||||||
|
property, content := cache.extractMetaTagInfo(node)
|
||||||
|
|
||||||
|
// If property is returned, it must be approved
|
||||||
|
if property != "" {
|
||||||
|
approved := false
|
||||||
|
for _, prefix := range cache.approvedPrefixes {
|
||||||
|
if strings.HasPrefix(property, prefix) {
|
||||||
|
approved = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !approved {
|
||||||
|
for _, tag := range cache.approvedTags {
|
||||||
|
if property == tag {
|
||||||
|
approved = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !approved {
|
||||||
|
t.Errorf("extractMetaTagInfo returned unapproved property: %q", property)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Content should match input if property is approved
|
||||||
|
if property != "" && content != contentValue {
|
||||||
|
t.Errorf("Content mismatch: want %q, got %q", contentValue, content)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Benchmark comparison for the fuzzed scenarios
|
||||||
|
func BenchmarkFuzzedGetTarget(b *testing.B) {
|
||||||
|
// Test with various challenging inputs found during fuzzing
|
||||||
|
inputs := []struct {
|
||||||
|
name string
|
||||||
|
target string
|
||||||
|
path string
|
||||||
|
query string
|
||||||
|
}{
|
||||||
|
{"Simple", "http://example.com", "/api", "k=v"},
|
||||||
|
{"LongPath", "http://example.com", strings.Repeat("/segment", 50), ""},
|
||||||
|
{"LongQuery", "http://example.com", "/", strings.Repeat("param=value&", 50)},
|
||||||
|
{"Unicode", "http://example.com", "/путь/路径/path", "q=значение"},
|
||||||
|
{"Encoded", "http://example.com", "/path%20with%20spaces", "q=%3Cscript%3E"},
|
||||||
|
{"Unix", "unix:///tmp/socket.sock", "/api/v1/resource", "id=123&format=json"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, input := range inputs {
|
||||||
|
b.Run(input.name, func(b *testing.B) {
|
||||||
|
cache := NewOGTagCache(input.target, false, 0, false)
|
||||||
|
u := &url.URL{Path: input.path, RawQuery: input.query}
|
||||||
|
|
||||||
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
_ = cache.getTarget(u)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
@ -12,15 +12,12 @@ func (c *OGTagCache) extractOGTags(doc *html.Node) map[string]string {
|
|||||||
|
|
||||||
var traverseNodes func(*html.Node)
|
var traverseNodes func(*html.Node)
|
||||||
traverseNodes = func(n *html.Node) {
|
traverseNodes = func(n *html.Node) {
|
||||||
// isOGMetaTag only checks if it's a <meta> tag.
|
|
||||||
// The actual filtering happens in extractMetaTagInfo now.
|
|
||||||
if isOGMetaTag(n) {
|
if isOGMetaTag(n) {
|
||||||
property, content := c.extractMetaTagInfo(n)
|
property, content := c.extractMetaTagInfo(n)
|
||||||
if property != "" {
|
if property != "" {
|
||||||
ogTags[property] = content
|
ogTags[property] = content
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
||||||
traverseNodes(child)
|
traverseNodes(child)
|
||||||
}
|
}
|
||||||
@ -39,43 +36,40 @@ func isOGMetaTag(n *html.Node) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// extractMetaTagInfo extracts property and content from a meta tag
|
// extractMetaTagInfo extracts property and content from a meta tag
|
||||||
// *and* checks if the property is approved.
|
|
||||||
// Returns empty property string if the tag is not approved.
|
|
||||||
func (c *OGTagCache) extractMetaTagInfo(n *html.Node) (property, content string) {
|
func (c *OGTagCache) extractMetaTagInfo(n *html.Node) (property, content string) {
|
||||||
var rawProperty string // Store the property found before approval check
|
var propertyKey string
|
||||||
|
|
||||||
|
// Single pass through attributes, using range to avoid bounds checking
|
||||||
for _, attr := range n.Attr {
|
for _, attr := range n.Attr {
|
||||||
if attr.Key == "property" || attr.Key == "name" {
|
switch attr.Key {
|
||||||
rawProperty = attr.Val
|
case "property", "name":
|
||||||
}
|
propertyKey = attr.Val
|
||||||
if attr.Key == "content" {
|
case "content":
|
||||||
content = attr.Val
|
content = attr.Val
|
||||||
}
|
}
|
||||||
}
|
// Early exit if we have both
|
||||||
|
if propertyKey != "" && content != "" {
|
||||||
// Check if the rawProperty is approved
|
|
||||||
isApproved := false
|
|
||||||
for _, prefix := range c.approvedPrefixes {
|
|
||||||
if strings.HasPrefix(rawProperty, prefix) {
|
|
||||||
isApproved = true
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Check exact approved tags if not already approved by prefix
|
|
||||||
if !isApproved {
|
if propertyKey == "" {
|
||||||
for _, tag := range c.approvedTags {
|
return "", content
|
||||||
if rawProperty == tag {
|
}
|
||||||
isApproved = true
|
|
||||||
break
|
// Check prefixes first (more common case)
|
||||||
}
|
for _, prefix := range c.approvedPrefixes {
|
||||||
|
if strings.HasPrefix(propertyKey, prefix) {
|
||||||
|
return propertyKey, content
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Only return the property if it's approved
|
// Check exact matches
|
||||||
if isApproved {
|
for _, tag := range c.approvedTags {
|
||||||
property = rawProperty
|
if propertyKey == tag {
|
||||||
|
return propertyKey, content
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Content is returned regardless, but property will be "" if not approved
|
return "", content
|
||||||
return property, content
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user