Enhance support of Browsertrix Crawler arguments

benoit74 2025-02-13 15:14:53 +00:00
parent 4f9085b10e
commit dc6b5aafb7
2 changed files with 462 additions and 62 deletions


@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
- Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
- Document all Browsertrix Crawler default arguments values (#416)
### Fixed

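Illustrative note (not part of the commit): the options added below are plain pass-throughs to Browsertrix Crawler. A hypothetical zimit argument list that uses only flags defined in the parser diff that follows (the seed URL and values are invented):

```python
# Hypothetical example: every flag name below appears in the argparse diff
# that follows; the URL and option values are invented for illustration.
raw_args = [
    "--url", "https://example.com/docs/",
    "--scopeType", "prefix",
    "--workers", "4",
    "--screenshot", "view,thumbnail",
    "--postLoadDelay", "5",
    "--saveState", "always",
    "--saveStateInterval", "600",
]
```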

@@ -129,6 +129,7 @@ def run(raw_args):
)
parser.add_argument("-u", "--url", help="The URL to start crawling from")
parser.add_argument("--title", help="ZIM title")
parser.add_argument("--description", help="ZIM description")
parser.add_argument("--long-description", help="ZIM long description metadata")
@@ -138,52 +139,66 @@ def run(raw_args):
help="If set, read a list of seed urls, one per line, from the specified",
)
parser.add_argument(
"-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
)
parser.add_argument(
"--crawlId",
help="A user provided ID for this crawl or crawl configuration (can also be "
"set via CRAWL_ID env var, defaults to hostname)",
)
parser.add_argument(
"--waitUntil",
help="Puppeteer page.goto() condition to wait for before continuing. One of "
"load, domcontentloaded, networkidle0 or networkidle2, or a "
"comma-separated combination of those. Default is load,networkidle2",
-default="load",
)
parser.add_argument(
"--depth",
help="The depth of the crawl for all seeds. Default is -1.",
type=int,
)
parser.add_argument(
"--extraHops",
help="Number of extra 'hops' to follow, beyond the current scope. "
"Default is 0.",
type=int,
)
parser.add_argument(
"--limit",
help="Limit crawl to this number of pages. Default is 0 (no limit).",
type=int,
)
parser.add_argument(
"--maxPageLimit",
help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
"0 (no limit)",
type=int,
)
parser.add_argument(
"--timeout",
help="Timeout for each page to load (in seconds). Default is 90 secs.",
type=int,
)
parser.add_argument(
"--scopeType",
help="A predfined scope of the crawl. For more customization, "
"use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom"
"if scopeIncludeRx is set, prefix otherwise.",
choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"], choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
) )
parser.add_argument( parser.add_argument(
"--include", "--include",
help="Regex of page URLs that should be " help="Regex of page URLs that should be included in the crawl (defaults to "
"included in the crawl (defaults to "
"the immediate directory of URL)", "the immediate directory of URL)",
) )
@ -192,48 +207,185 @@ def run(raw_args):
help="Regex of page URLs that should be excluded from the crawl", help="Regex of page URLs that should be excluded from the crawl",
) )
-parser.add_argument(
-"--collection",
-help="Collection name to crawl to (replay will be accessible "
-"under this name in pywb preview) instead of crawl-@ts",
-)
parser.add_argument(
"--allowHashUrls",
help="Allow Hashtag URLs, useful for single-page-application crawling or "
"when different hashtags load dynamic content",
action="store_true",
)
parser.add_argument(
"--selectLinks",
help="One or more selectors for extracting links, in the format "
"[css selector]->[property to use],[css selector]->@[attribute to use]",
)
parser.add_argument(
"--clickSelector",
help="Selector for elements to click when using the autoclick behavior. Default"
" is 'a'",
) )
parser.add_argument(
"--blockRules",
help="Additional rules for blocking certain URLs from being loaded, by URL "
"regex and optionally via text match in an iframe",
)
parser.add_argument(
"--blockMessage",
help="If specified, when a URL is blocked, a record with this error message is"
" added instead",
)
parser.add_argument(
"--blockAds",
help="If set, block advertisements from being loaded (based on Stephen Black's"
" blocklist). Note that some bad domains are also blocked by zimit"
" configuration even if this option is not set.",
)
parser.add_argument(
"--adBlockMessage",
help="If specified, when an ad is blocked, a record with this error message is"
" added instead",
)
parser.add_argument(
"--collection",
help="Collection name to crawl to (replay will be accessible "
"under this name in pywb preview). Default is crawl-@ts.",
)
parser.add_argument(
"--headless",
help="Run in headless mode, otherwise start xvfb",
action="store_true",
)
parser.add_argument(
"--driver",
help="Custom driver for the crawler, if any",
)
parser.add_argument(
"--generateCDX",
help="If set, generate index (CDXJ) for use with pywb after crawl is done",
action="store_true",
)
parser.add_argument(
"--combineWARC",
help="If set, combine the warcs",
action="store_true",
)
parser.add_argument(
"--rolloverSize",
help="If set, declare the rollover size. Default is 1000000000.",
type=int,
)
parser.add_argument(
"--generateWACZ",
help="If set, generate WACZ on disk",
action="store_true",
)
parser.add_argument(
"--logging",
help="Crawler logging configuration",
)
parser.add_argument(
"--logLevel",
help="Comma-separated list of log levels to include in logs",
)
parser.add_argument(
"--logContext",
help="Comma-separated list of contexts to include in logs",
choices=[
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
"storage",
"text",
"exclusion",
"screenshots",
"screencast",
"originOverride",
"healthcheck",
"browser",
"blocking",
"behavior",
"behaviorScript",
"jsError",
"fetch",
"pageStatus",
"memoryStatus",
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
],
)
parser.add_argument(
"--logExcludeContext",
help="Comma-separated list of contexts to NOT include in logs. Default is "
"recorderNetwork,jsError,screencast",
choices=[
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
"storage",
"text",
"exclusion",
"screenshots",
"screencast",
"originOverride",
"healthcheck",
"browser",
"blocking",
"behavior",
"behaviorScript",
"jsError",
"fetch",
"pageStatus",
"memoryStatus",
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
],
)
parser.add_argument(
"--text",
help="Extract initial (default) or final text to pages.jsonl or WARC resource"
" record(s)",
)
# cwd is not manipulable
parser.add_argument(
"--mobileDevice",
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob/"
"main/packages/puppeteer-core/src/common/Device.ts",
-default="Pixel 2",
)
-parser.add_argument(
-"--noMobileDevice",
-help="Do not emulate a mobile device (use at your own risk, behavior is"
-"uncertain)",
-action="store_true",
-default=False,
-)
parser.add_argument(
@@ -255,33 +407,108 @@ def run(raw_args):
"(usually /sitemap.xml)",
)
parser.add_argument(
"--sitemapFromDate",
help="If set, filter URLs from sitemaps to those greater than or equal to (>=)"
" provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
)
parser.add_argument(
"--sitemapToDate",
help="If set, filter URLs from sitemaps to those less than or equal to (<=) "
"provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
)
parser.add_argument(
"--statsFilename",
help="If set, output stats as JSON to this file. (Relative filename resolves "
"to crawl working directory)",
)
parser.add_argument(
"--behaviors",
help="Which background behaviors to enable on each page. Default is autoplay,"
"autofetch,autoscroll,siteSpecific",
-default="autoplay,autofetch,siteSpecific",
)
parser.add_argument(
"--behaviorTimeout",
help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
"If 0, a behavior can run until finish. Default is 90.",
type=int,
)
parser.add_argument(
"--postLoadDelay",
help="If >0, amount of time to sleep (in seconds) after page has loaded, before"
" taking screenshots / getting text / running behaviors. Default is 0.",
type=int,
)
parser.add_argument(
"--delay",
help="If >0, amount of time to sleep (in seconds) after behaviors "
"before moving on to next page. Default is 0.",
type=int,
)
parser.add_argument(
"--dedupPolicy",
help="Deduplication policy. Default is skip",
choices=["skip", "revisit", "keep"],
)
parser.add_argument(
"--profile",
help="Path or HTTP(S) URL to tar.gz file which contains the browser profile "
"directory",
)
parser.add_argument(
"--screenshot",
help="Screenshot options for crawler. One of view, thumbnail, fullPage, "
"fullPageFinal or a comma-separated combination of those.",
)
parser.add_argument(
"--screencastPort",
help="If set to a non-zero value, starts an HTTP server with screencast "
"accessible on this port.",
type=int,
)
parser.add_argument(
"--screencastRedis",
help="If set, will use the state store redis pubsub for screencasting",
action="store_true",
)
parser.add_argument(
"--warcInfo",
help="Optional fields added to the warcinfo record in combined WARCs",
)
parser.add_argument(
"--saveState",
help="If the crawl state should be serialized to the crawls/ directory. "
"Defaults to 'partial', only saved when crawl is interrupted",
choices=["never", "partial", "always"],
)
parser.add_argument(
"--saveStateInterval",
help="If save state is set to 'always', also save state during the crawl at "
"this interval (in seconds). Default to 300.",
type=int,
)
parser.add_argument(
"--saveStateHistory",
help="Number of save states to keep during the duration of a crawl. "
"Default to 5.",
type=int,
)
size_group = parser.add_mutually_exclusive_group()
size_group.add_argument(
"--sizeSoftLimit",
@@ -329,7 +556,134 @@ def run(raw_args):
help="overwrite current crawl data: if set, existing collection directory "
"will be deleted before crawl is started",
action="store_true",
)
parser.add_argument(
"--waitOnDone",
help="if set, wait for interrupt signal when finished instead of exiting",
action="store_true",
)
parser.add_argument(
"--restartsOnError",
help="if set, assume will be restarted if interrupted, don't run post-crawl "
"processes on interrupt",
action="store_true",
)
parser.add_argument(
"--netIdleWait",
help="If set, wait for network idle after page load and after behaviors are "
"done (in seconds). if -1 (default), determine based on scope.",
type=int,
)
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
)
parser.add_argument(
"--originOverride",
help="if set, will redirect requests from each origin in key to origin in the "
"value, eg. --originOverride https://host:port=http://alt-host:alt-port",
)
parser.add_argument(
"--logErrorsToRedis",
help="If set, write error messages to redis",
action="store_true",
)
parser.add_argument(
"--writePagesToRedis",
help="If set, write page objects to redis",
action="store_true",
)
parser.add_argument(
"--maxPageRetries",
help="If set, number of times to retry a page that failed to load before page"
" is considered to have failed. Default is 2.",
type=int,
)
parser.add_argument(
"--failOnFailedSeed",
help="If set, crawler will fail with exit code 1 if any seed fails. When "
"combined with --failOnInvalidStatus, will result in crawl failing with exit "
"code 1 if any seed has a 4xx/5xx response",
action="store_true",
)
parser.add_argument(
"--failOnFailedLimit",
help="If set, save state and exit if number of failed pages exceeds this value",
action="store_true",
)
parser.add_argument(
"--failOnInvalidStatus",
help="If set, will treat pages with 4xx or 5xx response as failures. When "
"combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl "
"failing due to non-200 responses",
action="store_true",
)
# customBehaviors not included because it has special handling
# debugAccessRedis not included due to custom redis engine in zimit
parser.add_argument(
"--debugAccessBrowser",
help="if set, allow debugging browser on port 9222 via CDP",
action="store_true",
)
parser.add_argument(
"--warcPrefix",
help="prefix for WARC files generated, including WARCs added to WACZ",
)
parser.add_argument(
"--serviceWorker",
help="service worker handling: disabled, enabled or disabled-if-profile. "
"Default: disabled.",
)
parser.add_argument(
"--proxyServer",
help="if set, will use specified proxy server. Takes precedence over any env "
"var proxy settings",
)
parser.add_argument(
"--dryRun",
help="If true, no archive data is written to disk, only pages and logs (and "
"optionally saved state).",
action="store_true",
)
parser.add_argument(
"--qaSource",
help="Required for QA mode. Source (WACZ or multi WACZ) for QA",
)
parser.add_argument(
"--qaDebugImageDiff",
help="if specified, will write crawl.png, replay.png and diff.png for each "
"page where they're different",
action="store_true",
)
parser.add_argument(
"--sshProxyPrivateKeyFile",
help="path to SSH private key for SOCKS5 over SSH proxy connection",
)
parser.add_argument(
"--sshProxyKnownHostsFile",
help="path to SSH known hosts file for SOCKS5 over SSH proxy connection",
)
parser.add_argument(
@@ -355,11 +709,6 @@ def run(raw_args):
help="[warc2zim] Custom CSS file URL/path to inject into all articles",
)
-parser.add_argument(
-"--statsFilename",
-help="If set, output stats as JSON to this file",
-)
parser.add_argument(
"--config",
help="Path to YAML config file. If set, browsertrix-crawler will use this file"
@@ -374,8 +723,10 @@ def run(raw_args):
)
parser.add_argument(
"--zim-lang",
help="Language metadata of ZIM "
"(warc2zim --lang param). ISO-639-3 code. "
"Retrieved from homepage if found, fallback to `eng`",
)
parser.add_argument(
@@ -497,10 +848,6 @@ def run(raw_args):
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
-if not zimit_args.noMobileDevice:
-cmd_args.append("--mobileDevice")
-cmd_args.append(zimit_args.mobileDevice)
cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
@@ -681,13 +1028,14 @@ def get_cleaned_url(url: str):
def get_node_cmd_line(args):
-node_cmd = ["crawl", "--failOnFailedSeed"]
node_cmd = ["crawl"]
for arg in [
"title",
"description",
"urlFile",
"workers",
"crawlId",
"waitUntil",
"depth", "depth",
"extraHops", "extraHops",
"limit", "limit",
@ -698,13 +1046,44 @@ def get_node_cmd_line(args):
"exclude", "exclude",
"collection", "collection",
"allowHashUrls", "allowHashUrls",
"lang", "selectLinks",
"clickSelector",
"blockRules",
"blockMessage",
"blockAds",
"adBlockMessage",
"collection",
"headless",
"driver",
"generateCDX",
"combineWARC",
"rolloverSize",
"generateWACZ",
"logging",
"logLevel",
"logContext",
"logExcludeContext",
"text",
"mobileDevice",
"userAgent", "userAgent",
# userAgentSuffix (manipulated),
"useSitemap", "useSitemap",
"sitemapFromDate",
"sitemapToDate",
# statsFilename (manipulated),
"behaviors", "behaviors",
"behaviorTimeout", "behaviorTimeout",
"postLoadDelay",
"delay", "delay",
"dedupPolicy",
"profile", "profile",
"screenshot",
"screencastPort",
"screencastRedis",
"warcInfo",
"saveState",
"saveStateInterval",
"saveStateHistory",
"sizeSoftLimit", "sizeSoftLimit",
"sizeHardLimit", "sizeHardLimit",
"diskUtilization", "diskUtilization",
@@ -712,9 +1091,28 @@ def get_node_cmd_line(args):
"timeHardLimit",
"healthCheckPort",
"overwrite",
"waitOnDone",
"restartsOnError",
"netIdleWait",
"lang",
"originOverride",
"logErrorsToRedis",
"writePagesToRedis",
"maxPageRetries",
"failOnFailedSeed",
"failOnFailedLimit",
"failOnInvalidStatus",
"debugAccessBrowser",
"warcPrefix",
"serviceWorker",
"proxyServer",
"dryRun",
"qaSource",
"qaDebugImageDiff",
"sshProxyPrivateKeyFile",
"sshProxyKnownHostsFile",
"customBehaviors", "customBehaviors",
"config",
]:
value = getattr(args, arg)
if arg == "userAgent":
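The hunk ends mid-loop, so the forwarding logic that consumes `value` is not shown. A minimal sketch of the pattern the visible lines suggest (an assumption, not the verbatim zimit implementation): skip unset options, emit bare flags for booleans, and pass other values through unchanged.

```python
def forward_args(args, arg_names):
    """Sketch (assumption): turn parsed zimit options back into Browsertrix
    Crawler CLI flags, reusing the camelCase option names verbatim."""
    node_cmd = ["crawl"]
    for arg in arg_names:
        value = getattr(args, arg, None)
        if value is None or value is False:
            continue  # option was not supplied on the zimit command line
        node_cmd.append(f"--{arg}")
        if value is not True:  # store_true flags take no value
            node_cmd.append(str(value))
    return node_cmd
```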