anubis/internal/test/playwright_test.go
Jason Cameron 77436207e6
feat: Add Open Graph tag support (#195)
* feat: Add Open Graph tag support (og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Fix: Prevent nil pointer dereference in test (og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat!: Implement Open Graph tag caching and passthrough functionality (WIP)

I'm going to sleep. currently tags are passed to renderIndex.

see https://github.com/TecharoHQ/anubis/issues/131

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat: Add configuration for air tool with build and logger settings

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat: Move OG tags to base template (og-tags)

Moves the Open Graph (OG) tags from the index template to
the base template. This allows OG tags to be set on any
page, not just the index.  Also adds a
BaseWithOGTags function to the web package to allow
passing OG tags to the base template.  Removes the
ogTags parameter from the Index function and template.

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Delete CHANGELOG.md

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat: Add language attribute to HTML tag in template

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix(tests):  Fix nil pointer ref

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat(og-tags): Add timeout to http client (og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* style: fix line endings & indentation

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* style: add inspection comment for GoBoolExpressions in UnchangingCache

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat(og-tags): Implement Open Graph tag fetching and caching

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix(og-tags): Simplify Open Graph tag extraction logic

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix(og-tags): Add nil check in isOGMetaTag and enhance test cases

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat(og-tags): Add approved tags and prefixes for Open Graph extraction

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* test(og-tags): Update tests with approved tags and improve clarity

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* chore: Add changelog notes

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix: Improve stability of the target fetcher?

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix: Update template error handling and improve Open Graph tag integration

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* style: format files and remove deubg logs

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat: Credit CELPHASE for mascot design (og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat: Credit CELPHASE for mascot design (og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat: Allow twitter prefixed OG tags by default

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* chore: replace /tmp with /var

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Update docs/docs/CHANGELOG.md

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* Update docs/docs/admin/configuration/open-graph.mdx

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* chore: add fediverse to default prefixes (#og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat(og-tags): Remove og-query-distinct flag

This commit removes the `og-query-distinct` flag and
associated logic.  URLs with different query parameters
will now always be treated as the same cache key for Open
Graph tags.  This simplifies the caching logic and
improves performance.

Additionally, the http client used for fetching OG tags
is now a member of the OGTagCache struct, rather than a
global variable. This improves testability and allows
for more flexible configuration in the future.

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Update docs/docs/admin/configuration/open-graph.mdx

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* docs: remove og tags references

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* refactor: rename url > u to not overlap package name

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Update internal/ogtags/cache.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* Update internal/ogtags/cache.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* fix(tests): Don't use network when network access is disabled

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Fix: Handle nil URL in GetOGTags (og-tags)

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* chore: sort installation docs alphabetically

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix(tests): validate that no duplicate requests are made

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* style(tests): remove unused ok var

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* docs: convert to table fmt

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat(og-tags): Enhance OG tag fetching and caching

Adds additional approved OG tags (`keywords`, `author`), improves

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* chore: update generated templ's after format

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* fix(tests): update integration_test.go to reflect the new behavior of fetchHTMLDocument

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Revert "data/botPolicies: allow iMessage scraper by default (#178)"

This reverts commit 21a9d777

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Fix: Simplify ogTags access in cache test.

Didn't know this was possible! wow!

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Fix: Handle request timeouts when fetching OG tags (#og-tags)

Cache a nil result for half the TTL to avoid repeatedly
requesting a timed-out URL.

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Fix: make OG tags passthrough option function.

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Fix: Handle timeouts and non-200 responses when fetching OG tags (og-tags)

- Cache empty results for timeouts and non-200 status codes
  to avoid spamming the server.
- Use a non-nil empty map to represent empty results in the
  cache, as nil would be a cache miss.

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* feat(og-tags): switch to http.MaxBytesReader

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* chore(og-tags): add noindex, nofollow meta tag and update error line numbers

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

---------

Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>
Co-authored-by: Xe Iaso <me@xeiaso.net>
2025-04-06 20:02:12 -04:00

457 lines
11 KiB
Go

//go:build !windows
// Integration tests for Anubis, using Playwright.
//
// These tests require an already running Anubis and Playwright server.
//
// Anubis must be configured to redirect to the server started by the test suite.
// The bind address and the Anubis server can be specified using the flags `-bind` and `-anubis` respectively.
//
// Playwright must be started in server mode using `npx playwright@1.50.1 run-server --port 3000`.
// The version must match the minor used by the playwright-go package.
//
// On unsupported systems you may be able to use a container instead: https://playwright.dev/docs/docker#remote-connection
//
// In that case you may need to set the `-playwright` flag to the container's URL, and specify the `--host` the run-server command listens on.
package test
import (
"flag"
"fmt"
"net"
"net/http"
"net/http/httptest"
"net/url"
"os"
"os/exec"
"strconv"
"testing"
"time"
"github.com/TecharoHQ/anubis"
libanubis "github.com/TecharoHQ/anubis/lib"
"github.com/playwright-community/playwright-go"
)
var (
playwrightPort = flag.Int("playwright-port", 9001, "Playwright port")
playwrightServer = flag.String("playwright", "ws://localhost:9001", "Playwright server URL")
playwrightMaxTime = flag.Duration("playwright-max-time", 5*time.Second, "maximum time for Playwright requests")
playwrightMaxHardTime = flag.Duration("playwright-max-hard-time", 5*time.Minute, "maximum time for hard Playwright requests")
testCases = []testCase{
{
name: "firefox",
action: actionChallenge,
realIP: placeholderIP,
userAgent: "Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0",
},
{
name: "headlessChrome",
action: actionDeny,
realIP: placeholderIP,
userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/120.0.6099.28 Safari/537.36",
},
{
name: "kagiBadIP",
action: actionChallenge,
isHard: true,
realIP: placeholderIP,
userAgent: "Mozilla/5.0 (compatible; Kagibot/1.0; +https://kagi.com/bot)",
},
{
name: "kagiGoodIP",
action: actionAllow,
realIP: "216.18.205.234",
userAgent: "Mozilla/5.0 (compatible; Kagibot/1.0; +https://kagi.com/bot)",
},
{
name: "unknownAgent",
action: actionAllow,
realIP: placeholderIP,
userAgent: "AnubisTest/0",
},
}
)
const (
actionAllow action = "ALLOW"
actionDeny action = "DENY"
actionChallenge action = "CHALLENGE"
placeholderIP = "fd11:5ee:bad:c0de::"
playwrightVersion = "1.50.1"
)
type action string
type testCase struct {
name string
action action
isHard bool
realIP, userAgent string
}
func doesNPXExist(t *testing.T) {
t.Helper()
if _, err := exec.LookPath("npx"); err != nil {
t.Skipf("npx not found in PATH, skipping integration smoke testing: %v", err)
}
}
func run(t *testing.T, command string) string {
t.Helper()
shPath, err := exec.LookPath("sh")
if err != nil {
t.Fatalf("[unexpected] %v", err)
}
t.Logf("running command: %s", command)
cmd := exec.Command(shPath, "-c", command)
cmd.Stdin = nil
cmd.Stderr = os.Stderr
output, err := cmd.Output()
if err != nil {
t.Fatalf("can't run command: %v", err)
}
return string(output)
}
func daemonize(t *testing.T, command string) {
t.Helper()
shPath, err := exec.LookPath("sh")
if err != nil {
t.Fatalf("[unexpected] %v", err)
}
t.Logf("daemonizing command: %s", command)
cmd := exec.Command(shPath, "-c", command)
cmd.Stdin = nil
cmd.Stderr = os.Stderr
cmd.Stdout = os.Stdout
if err := cmd.Start(); err != nil {
t.Fatalf("can't daemonize command: %v", err)
}
t.Cleanup(func() {
cmd.Process.Kill()
})
}
func startPlaywright(t *testing.T) {
t.Helper()
if os.Getenv("CI") == "true" {
run(t, fmt.Sprintf("npx --yes playwright@%s install --with-deps", playwrightVersion))
} else {
run(t, fmt.Sprintf("npx --yes playwright@%s install", playwrightVersion))
}
daemonize(t, fmt.Sprintf("npx --yes playwright@%s run-server --port %d", playwrightVersion, *playwrightPort))
for {
if _, err := http.Get(fmt.Sprintf("http://localhost:%d", *playwrightPort)); err != nil {
time.Sleep(500 * time.Millisecond)
continue
}
break
}
//nosleep:bypass XXX(Xe): Playwright doesn't have a good way to signal readiness. This is a HACK that will just let the tests pass.
time.Sleep(2 * time.Second)
}
func TestPlaywrightBrowser(t *testing.T) {
if os.Getenv("DONT_USE_NETWORK") != "" {
t.Skip("test requires network egress")
return
}
doesNPXExist(t)
startPlaywright(t)
pw := setupPlaywright(t)
anubisURL := spawnAnubis(t)
browsers := []playwright.BrowserType{pw.Chromium, pw.Firefox, pw.WebKit}
for _, typ := range browsers {
t.Run(typ.Name()+"/warmup", func(t *testing.T) {
browser, err := typ.Connect(buildBrowserConnect(typ.Name()), playwright.BrowserTypeConnectOptions{
ExposeNetwork: playwright.String("<loopback>"),
})
if err != nil {
t.Fatalf("could not connect to remote browser: %v", err)
}
defer browser.Close()
ctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
AcceptDownloads: playwright.Bool(false),
ExtraHttpHeaders: map[string]string{
"X-Real-Ip": "127.0.0.1",
},
UserAgent: playwright.String("Sephiroth"),
})
if err != nil {
t.Fatalf("could not create context: %v", err)
}
defer ctx.Close()
page, err := ctx.NewPage()
if err != nil {
t.Fatalf("could not create page: %v", err)
}
defer page.Close()
timeout := 2.0
page.Goto(anubisURL, playwright.PageGotoOptions{
Timeout: &timeout,
})
})
for _, tc := range testCases {
name := fmt.Sprintf("%s/%s", typ.Name(), tc.name)
t.Run(name, func(t *testing.T) {
_, hasDeadline := t.Deadline()
if tc.isHard && hasDeadline {
t.Skip("skipping hard challenge with deadline")
}
var performedAction action
var err error
for i := 0; i < 5; i++ {
performedAction, err = executeTestCase(t, tc, typ, anubisURL)
if performedAction == tc.action {
break
}
time.Sleep(time.Duration(i+1) * 250 * time.Millisecond)
}
if performedAction != tc.action {
t.Errorf("unexpected test result, expected %s, got %s", tc.action, performedAction)
}
if err != nil {
t.Fatalf("test error: %v", err)
}
})
}
}
}
func buildBrowserConnect(name string) string {
u, _ := url.Parse(*playwrightServer)
q := u.Query()
q.Set("browser", name)
u.RawQuery = q.Encode()
return u.String()
}
func executeTestCase(t *testing.T, tc testCase, typ playwright.BrowserType, anubisURL string) (action, error) {
deadline, _ := t.Deadline()
browser, err := typ.Connect(buildBrowserConnect(typ.Name()), playwright.BrowserTypeConnectOptions{
ExposeNetwork: playwright.String("<loopback>"),
})
if err != nil {
return "", fmt.Errorf("could not connect to remote browser: %w", err)
}
defer browser.Close()
ctx, err := browser.NewContext(playwright.BrowserNewContextOptions{
AcceptDownloads: playwright.Bool(false),
ExtraHttpHeaders: map[string]string{
"X-Real-Ip": tc.realIP,
},
UserAgent: playwright.String(tc.userAgent),
})
if err != nil {
return "", fmt.Errorf("could not create context: %w", err)
}
defer ctx.Close()
page, err := ctx.NewPage()
if err != nil {
return "", fmt.Errorf("could not create page: %w", err)
}
defer page.Close()
// Attempt challenge.
start := time.Now()
_, err = page.Goto(anubisURL, playwright.PageGotoOptions{
Timeout: pwTimeout(tc, deadline),
})
if err != nil {
return "", pwFail(t, page, "could not navigate to test server: %v", err)
}
hadChallenge := false
switch tc.action {
case actionChallenge:
// FIXME: This could race if challenge is completed too quickly.
checkImage(t, tc, deadline, page, "#image[src*=pensive], #image[src*=happy]")
hadChallenge = true
case actionDeny:
checkImage(t, tc, deadline, page, "#image[src*=sad]")
return actionDeny, nil
}
// Ensure protected resource was provided.
res, err := page.Locator("#anubis-test").TextContent(playwright.LocatorTextContentOptions{
Timeout: pwTimeout(tc, deadline),
})
end := time.Now()
if err != nil {
pwFail(t, page, "could not get text content: %v", err)
}
var tm int64
if _, err := fmt.Sscanf(res, "%d", &tm); err != nil {
pwFail(t, page, "unexpected output: %s", res)
}
if tm < start.Unix() || end.Unix() < tm {
pwFail(t, page, "unexpected timestamp in output: %d not in range %d..%d", tm, start.Unix(), end.Unix())
}
if hadChallenge {
return actionChallenge, nil
} else {
return actionAllow, nil
}
}
func checkImage(t *testing.T, tc testCase, deadline time.Time, page playwright.Page, locator string) {
image := page.Locator(locator)
err := image.WaitFor(playwright.LocatorWaitForOptions{
Timeout: pwTimeout(tc, deadline),
})
if err != nil {
pwFail(t, page, "could not wait for result: %v", err)
}
failIsVisible, err := image.IsVisible()
if err != nil {
pwFail(t, page, "could not check result image: %v", err)
}
if !failIsVisible {
pwFail(t, page, "expected result image not visible")
}
}
func pwFail(t *testing.T, page playwright.Page, format string, args ...any) error {
t.Helper()
saveScreenshot(t, page)
return fmt.Errorf(format, args...)
}
func pwTimeout(tc testCase, deadline time.Time) *float64 {
max := *playwrightMaxTime
if tc.isHard {
max = *playwrightMaxHardTime
}
d := time.Until(deadline)
if d <= 0 || d > max {
return playwright.Float(float64(max.Milliseconds()))
}
return playwright.Float(float64(d.Milliseconds()))
}
func saveScreenshot(t *testing.T, page playwright.Page) {
t.Helper()
data, err := page.Screenshot()
if err != nil {
t.Logf("could not take screenshot: %v", err)
return
}
f, err := os.CreateTemp("", "anubis-test-fail-*.png")
if err != nil {
t.Logf("could not create temporary file: %v", err)
return
}
defer f.Close()
_, err = f.Write(data)
if err != nil {
t.Logf("could not write screenshot: %v", err)
return
}
t.Logf("screenshot saved to %s", f.Name())
}
func setupPlaywright(t *testing.T) *playwright.Playwright {
err := playwright.Install(&playwright.RunOptions{
SkipInstallBrowsers: true,
})
if err != nil {
t.Fatalf("could not install Playwright: %v", err)
}
pw, err := playwright.Run()
if err != nil {
t.Fatalf("could not start Playwright: %v", err)
}
return pw
}
func spawnAnubis(t *testing.T) string {
t.Helper()
h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Add("Content-Type", "text/html")
fmt.Fprintf(w, "<html><body><span id=anubis-test>%d</span></body></html>", time.Now().Unix())
})
policy, err := libanubis.LoadPoliciesOrDefault("", anubis.DefaultDifficulty)
if err != nil {
t.Fatal(err)
}
listener, err := net.Listen("tcp", ":0")
if err != nil {
t.Fatalf("can't listen on random port: %v", err)
}
addr := listener.Addr().(*net.TCPAddr)
host := "localhost"
port := strconv.Itoa(addr.Port)
s, err := libanubis.New(libanubis.Options{
Next: h,
Policy: policy,
ServeRobotsTXT: true,
Target: "http://" + host + ":" + port,
})
if err != nil {
t.Fatalf("can't construct libanubis.Server: %v", err)
}
ts := &httptest.Server{
Listener: listener,
Config: &http.Server{Handler: s},
}
ts.Start()
t.Log(ts.URL)
t.Cleanup(func() {
ts.Close()
})
return ts.URL
}