Enhance support of Browsertrix Crawler arguments

benoit74 2025-02-13 15:14:53 +00:00
parent 4f9085b10e
commit dc6b5aafb7
2 changed files with 462 additions and 62 deletions


@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433)
- Document all Browsertrix Crawler default arguments values (#416)
### Fixed


@@ -129,6 +129,7 @@ def run(raw_args):
)
parser.add_argument("-u", "--url", help="The URL to start crawling from")
parser.add_argument("--title", help="ZIM title")
parser.add_argument("--description", help="ZIM description")
parser.add_argument("--long-description", help="ZIM long description metadata")
@@ -138,52 +139,66 @@ def run(raw_args):
help="If set, read a list of seed urls, one per line, from the specified",
)
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
parser.add_argument(
"-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
)
parser.add_argument(
"--crawlId",
help="A user provided ID for this crawl or crawl configuration (can also be "
"set via CRAWL_ID env var, defaults to hostname)",
)
parser.add_argument(
"--waitUntil",
help="Puppeteer page.goto() condition to wait for before continuing. One of "
"load, domcontentloaded, networkidle0 or networkidle2, or a "
"comma-separated combination of those.",
default="load",
"comma-separated combination of those. Default is load,networkidle2",
)
parser.add_argument(
"--depth", help="The depth of the crawl for all seeds", type=int, default=-1
"--depth",
help="The depth of the crawl for all seeds. Default is -1.",
type=int,
)
parser.add_argument(
"--extraHops",
help="Number of extra 'hops' to follow, beyond the current scope",
help="Number of extra 'hops' to follow, beyond the current scope. "
"Default is 0.",
type=int,
)
parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int)
parser.add_argument(
"--limit",
help="Limit crawl to this number of pages. Default is 0 (no limit).",
type=int,
)
parser.add_argument(
"--maxPageLimit",
help="Maximum pages to crawl, overriding pageLimit if both are set",
help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
"0 (no limit)",
type=int,
)
parser.add_argument(
"--timeout",
help="Timeout for each page to load (in seconds)",
help="Timeout for each page to load (in seconds). Default is 90 secs.",
type=int,
default=90,
)
parser.add_argument(
"--scopeType",
help="A predfined scope of the crawl. For more customization, "
"use 'custom' and set scopeIncludeRx regexes",
"use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom"
"if scopeIncludeRx is set, prefix otherwise.",
choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
)
parser.add_argument(
"--include",
help="Regex of page URLs that should be "
"included in the crawl (defaults to "
help="Regex of page URLs that should be included in the crawl (defaults to "
"the immediate directory of URL)",
)
@@ -192,48 +207,185 @@ def run(raw_args):
help="Regex of page URLs that should be excluded from the crawl",
)
parser.add_argument(
"--allowHashUrls",
help="Allow Hashtag URLs, useful for "
"single-page-application crawling or "
"when different hashtags load dynamic "
"content",
help="Allow Hashtag URLs, useful for single-page-application crawling or "
"when different hashtags load dynamic content",
action="store_true",
)
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
"--selectLinks",
help="One or more selectors for extracting links, in the format "
"[css selector]->[property to use],[css selector]->@[attribute to use]",
)
parser.add_argument(
"--zim-lang",
help="Language metadata of ZIM "
"(warc2zim --lang param). ISO-639-3 code. "
"Retrieved from homepage if found, fallback to `eng`",
"--clickSelector",
help="Selector for elements to click when using the autoclick behavior. Default"
" is 'a'",
)
parser.add_argument(
"--blockRules",
help="Additional rules for blocking certain URLs from being loaded, by URL "
"regex and optionally via text match in an iframe",
)
parser.add_argument(
"--blockMessage",
help="If specified, when a URL is blocked, a record with this error message is"
" added instead",
)
parser.add_argument(
"--blockAds",
help="If set, block advertisements from being loaded (based on Stephen Black's"
" blocklist). Note that some bad domains are also blocked by zimit"
" configuration even if this option is not set.",
)
parser.add_argument(
"--adBlockMessage",
help="If specified, when an ad is blocked, a record with this error message is"
" added instead",
)
parser.add_argument(
"--collection",
help="Collection name to crawl to (replay will be accessible "
"under this name in pywb preview). Default is crawl-@ts.",
)
parser.add_argument(
"--headless",
help="Run in headless mode, otherwise start xvfb",
action="store_true",
)
parser.add_argument(
"--driver",
help="Custom driver for the crawler, if any",
)
parser.add_argument(
"--generateCDX",
help="If set, generate index (CDXJ) for use with pywb after crawl is done",
action="store_true",
)
parser.add_argument(
"--combineWARC",
help="If set, combine the warcs",
action="store_true",
)
parser.add_argument(
"--rolloverSize",
help="If set, declare the rollover size. Default is 1000000000.",
type=int,
)
parser.add_argument(
"--generateWACZ",
help="If set, generate WACZ on disk",
action="store_true",
)
parser.add_argument(
"--logging",
help="Crawler logging configuration",
)
parser.add_argument(
"--logLevel",
help="Comma-separated list of log levels to include in logs",
)
parser.add_argument(
"--logContext",
help="Comma-separated list of contexts to include in logs",
choices=[
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
"storage",
"text",
"exclusion",
"screenshots",
"screencast",
"originOverride",
"healthcheck",
"browser",
"blocking",
"behavior",
"behaviorScript",
"jsError",
"fetch",
"pageStatus",
"memoryStatus",
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
],
)
parser.add_argument(
"--logExcludeContext",
help="Comma-separated list of contexts to NOT include in logs. Default is "
"recorderNetwork,jsError,screencast",
choices=[
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
"storage",
"text",
"exclusion",
"screenshots",
"screencast",
"originOverride",
"healthcheck",
"browser",
"blocking",
"behavior",
"behaviorScript",
"jsError",
"fetch",
"pageStatus",
"memoryStatus",
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
],
)
parser.add_argument(
"--text",
help="Extract initial (default) or final text to pages.jsonl or WARC resource"
" record(s)",
)
# cwd is not manipulable
parser.add_argument(
"--mobileDevice",
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob/"
"main/packages/puppeteer-core/src/common/Device.ts",
default="Pixel 2",
)
parser.add_argument(
"--noMobileDevice",
help="Do not emulate a mobile device (use at your own risk, behavior is"
"uncertain)",
action="store_true",
default=False,
)
parser.add_argument(
@@ -255,33 +407,108 @@ def run(raw_args):
"(usually /sitemap.xml)",
)
parser.add_argument(
"--sitemapFromDate",
help="If set, filter URLs from sitemaps to those greater than or equal to (>=)"
" provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
)
parser.add_argument(
"--sitemapToDate",
help="If set, filter URLs from sitemaps to those less than or equal to (<=) "
"provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
)
parser.add_argument(
"--statsFilename",
help="If set, output stats as JSON to this file. (Relative filename resolves "
"to crawl working directory)",
)
parser.add_argument(
"--behaviors",
help="Which background behaviors to enable on each page",
default="autoplay,autofetch,siteSpecific",
help="Which background behaviors to enable on each page. Default is autoplay,"
"autofetch,autoscroll,siteSpecific",
)
parser.add_argument(
"--behaviorTimeout",
help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
"If 0, a behavior can run until finish",
"If 0, a behavior can run until finish. Default is 90.",
type=int,
)
parser.add_argument(
"--postLoadDelay",
help="If >0, amount of time to sleep (in seconds) after page has loaded, before"
" taking screenshots / getting text / running behaviors. Default is 0.",
type=int,
)
parser.add_argument(
"--delay",
help="If >0, amount of time to sleep (in seconds) after behaviors "
"before moving on to next page",
"before moving on to next page. Default is 0.",
type=int,
)
parser.add_argument(
"--dedupPolicy",
help="Deduplication policy. Default is skip",
choices=["skip", "revisit", "keep"],
)
parser.add_argument(
"--profile",
help="Path or HTTP(S) URL to tar.gz file which contains the browser profile "
"directory",
)
parser.add_argument(
"--screenshot",
help="Screenshot options for crawler. One of view, thumbnail, fullPage, "
"fullPageFinal or a comma-separated combination of those.",
)
parser.add_argument(
"--screencastPort",
help="If set to a non-zero value, starts an HTTP server with screencast "
"accessible on this port.",
type=int,
)
parser.add_argument(
"--screencastRedis",
help="If set, will use the state store redis pubsub for screencasting",
action="store_true",
)
parser.add_argument(
"--warcInfo",
help="Optional fields added to the warcinfo record in combined WARCs",
)
parser.add_argument(
"--saveState",
help="If the crawl state should be serialized to the crawls/ directory. "
"Defaults to 'partial', only saved when crawl is interrupted",
choices=["never", "partial", "always"],
)
parser.add_argument(
"--saveStateInterval",
help="If save state is set to 'always', also save state during the crawl at "
"this interval (in seconds). Default to 300.",
type=int,
)
parser.add_argument(
"--saveStateHistory",
help="Number of save states to keep during the duration of a crawl. "
"Default to 5.",
type=int,
)
size_group = parser.add_mutually_exclusive_group()
size_group.add_argument(
"--sizeSoftLimit",
@@ -329,7 +556,134 @@ def run(raw_args):
help="overwrite current crawl data: if set, existing collection directory "
"will be deleted before crawl is started",
action="store_true",
)
parser.add_argument(
"--waitOnDone",
help="if set, wait for interrupt signal when finished instead of exiting",
action="store_true",
)
parser.add_argument(
"--restartsOnError",
help="if set, assume will be restarted if interrupted, don't run post-crawl "
"processes on interrupt",
action="store_true",
)
parser.add_argument(
"--netIdleWait",
help="If set, wait for network idle after page load and after behaviors are "
"done (in seconds). if -1 (default), determine based on scope.",
type=int,
)
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
)
parser.add_argument(
"--originOverride",
help="if set, will redirect requests from each origin in key to origin in the "
"value, eg. --originOverride https://host:port=http://alt-host:alt-port",
)
parser.add_argument(
"--logErrorsToRedis",
help="If set, write error messages to redis",
action="store_true",
)
parser.add_argument(
"--writePagesToRedis",
help="If set, write page objects to redis",
action="store_true",
)
parser.add_argument(
"--maxPageRetries",
help="If set, number of times to retry a page that failed to load before page"
" is considered to have failed. Default is 2.",
type=int,
)
parser.add_argument(
"--failOnFailedSeed",
help="If set, crawler will fail with exit code 1 if any seed fails. When "
"combined with --failOnInvalidStatus, will result in crawl failing with exit "
"code 1 if any seed has a 4xx/5xx response",
action="store_true",
)
parser.add_argument(
"--failOnFailedLimit",
help="If set, save state and exit if number of failed pages exceeds this value",
action="store_true",
)
parser.add_argument(
"--failOnInvalidStatus",
help="If set, will treat pages with 4xx or 5xx response as failures. When "
"combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl "
"failing due to non-200 responses",
action="store_true",
)
# customBehaviors not included because it has special handling
# debugAccessRedis not included due to custom redis engine in zimit
parser.add_argument(
"--debugAccessBrowser",
help="if set, allow debugging browser on port 9222 via CDP",
action="store_true",
)
parser.add_argument(
"--warcPrefix",
help="prefix for WARC files generated, including WARCs added to WACZ",
)
parser.add_argument(
"--serviceWorker",
help="service worker handling: disabled, enabled or disabled-if-profile. "
"Default: disabled.",
)
parser.add_argument(
"--proxyServer",
help="if set, will use specified proxy server. Takes precedence over any env "
"var proxy settings",
)
parser.add_argument(
"--dryRun",
help="If true, no archive data is written to disk, only pages and logs (and "
"optionally saved state).",
action="store_true",
)
parser.add_argument(
"--qaSource",
help="Required for QA mode. Source (WACZ or multi WACZ) for QA",
)
parser.add_argument(
"--qaDebugImageDiff",
help="if specified, will write crawl.png, replay.png and diff.png for each "
"page where they're different",
action="store_true",
)
parser.add_argument(
"--sshProxyPrivateKeyFile",
help="path to SSH private key for SOCKS5 over SSH proxy connection",
)
parser.add_argument(
"--sshProxyKnownHostsFile",
help="path to SSH known hosts file for SOCKS5 over SSH proxy connection",
)
parser.add_argument(
@@ -355,11 +709,6 @@ def run(raw_args):
help="[warc2zim] Custom CSS file URL/path to inject into all articles",
)
parser.add_argument(
"--config",
help="Path to YAML config file. If set, browsertrix-crawler will use this file"
@@ -374,8 +723,10 @@ def run(raw_args):
)
parser.add_argument(
"--logging",
help="Crawler logging configuration",
"--zim-lang",
help="Language metadata of ZIM "
"(warc2zim --lang param). ISO-639-3 code. "
"Retrieved from homepage if found, fallback to `eng`",
)
parser.add_argument(
@@ -497,10 +848,6 @@ def run(raw_args):
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
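# point the crawler's working directory (--cwd) at zimit's temporary folder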
cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
@@ -681,13 +1028,14 @@ def get_cleaned_url(url: str):
def get_node_cmd_line(args):
node_cmd = ["crawl", "--failOnFailedSeed"]
node_cmd = ["crawl"]
for arg in [
"workers",
"waitUntil",
"urlFile",
"title",
"description",
"urlFile",
"workers",
"crawlId",
"waitUntil",
"depth",
"extraHops",
"limit",
@@ -698,13 +1046,44 @@ def get_node_cmd_line(args):
"exclude",
"collection",
"allowHashUrls",
"lang",
"selectLinks",
"clickSelector",
"blockRules",
"blockMessage",
"blockAds",
"adBlockMessage",
"collection",
"headless",
"driver",
"generateCDX",
"combineWARC",
"rolloverSize",
"generateWACZ",
"logging",
"logLevel",
"logContext",
"logExcludeContext",
"text",
"mobileDevice",
"userAgent",
# userAgentSuffix (manipulated),
"useSitemap",
"sitemapFromDate",
"sitemapToDate",
# statsFilename (manipulated),
"behaviors",
"behaviorTimeout",
"postLoadDelay",
"delay",
"dedupPolicy",
"profile",
"screenshot",
"screencastPort",
"screencastRedis",
"warcInfo",
"saveState",
"saveStateInterval",
"saveStateHistory",
"sizeSoftLimit",
"sizeHardLimit",
"diskUtilization",
@@ -712,9 +1091,28 @@ def get_node_cmd_line(args):
"timeHardLimit",
"healthCheckPort",
"overwrite",
"config",
"logging",
"waitOnDone",
"restartsOnError",
"netIdleWait",
"lang",
"originOverride",
"logErrorsToRedis",
"writePagesToRedis",
"maxPageRetries",
"failOnFailedSeed",
"failOnFailedLimit",
"failOnInvalidStatus",
"debugAccessBrowser",
"warcPrefix",
"serviceWorker",
"proxyServer",
"dryRun",
"qaSource",
"qaDebugImageDiff",
"sshProxyPrivateKeyFile",
"sshProxyKnownHostsFile",
"customBehaviors",
"config",
]:
value = getattr(args, arg)
if arg == "userAgent":