Enhance support of Browsertrix Crawler arguments

benoit74 2025-02-13 15:14:53 +00:00
parent 4f9085b10e
commit dc6b5aafb7
2 changed files with 462 additions and 62 deletions


@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
- Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
- Document all Browsertrix Crawler default arguments values (#416)
### Fixed

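Illustrative note (not part of the commit): the options added below are plain pass-throughs to Browsertrix Crawler. A hypothetical zimit argument list that uses only flags defined in the parser diff that follows (the seed URL and values are invented):

```python
# Hypothetical example: every flag name below appears in the argparse diff
# that follows; the URL and option values are invented for illustration.
raw_args = [
    "--url", "https://example.com/docs/",
    "--scopeType", "prefix",
    "--workers", "4",
    "--screenshot", "view,thumbnail",
    "--postLoadDelay", "5",
    "--saveState", "always",
    "--saveStateInterval", "600",
]
```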

@@ -129,6 +129,7 @@ def run(raw_args):
)
parser.add_argument("-u", "--url", help="The URL to start crawling from")
parser.add_argument("--title", help="ZIM title")
parser.add_argument("--description", help="ZIM description")
parser.add_argument("--long-description", help="ZIM long description metadata")
@@ -138,52 +139,66 @@ def run(raw_args):
help="If set, read a list of seed urls, one per line, from the specified",
)
parser.add_argument(
"-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
)
parser.add_argument(
"--crawlId",
help="A user provided ID for this crawl or crawl configuration (can also be "
"set via CRAWL_ID env var, defaults to hostname)",
)
parser.add_argument(
"--waitUntil",
help="Puppeteer page.goto() condition to wait for before continuing. One of "
"load, domcontentloaded, networkidle0 or networkidle2, or a "
"comma-separated combination of those. Default is load,networkidle2",
-default="load",
)
parser.add_argument(
"--depth",
help="The depth of the crawl for all seeds. Default is -1.",
type=int,
)
parser.add_argument(
"--extraHops",
help="Number of extra 'hops' to follow, beyond the current scope. "
"Default is 0.",
type=int,
)
parser.add_argument(
"--limit",
help="Limit crawl to this number of pages. Default is 0 (no limit).",
type=int,
)
parser.add_argument(
"--maxPageLimit",
help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
"0 (no limit)",
type=int,
)
parser.add_argument(
"--timeout",
help="Timeout for each page to load (in seconds). Default is 90 secs.",
type=int,
)
parser.add_argument(
"--scopeType",
help="A predfined scope of the crawl. For more customization, "
"use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom"
"if scopeIncludeRx is set, prefix otherwise.",
choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"], choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
) )
parser.add_argument( parser.add_argument(
"--include", "--include",
help="Regex of page URLs that should be " help="Regex of page URLs that should be included in the crawl (defaults to "
"included in the crawl (defaults to "
"the immediate directory of URL)", "the immediate directory of URL)",
) )
@ -192,48 +207,185 @@ def run(raw_args):
help="Regex of page URLs that should be excluded from the crawl", help="Regex of page URLs that should be excluded from the crawl",
) )
-parser.add_argument(
-"--collection",
-help="Collection name to crawl to (replay will be accessible "
-"under this name in pywb preview) instead of crawl-@ts",
-)
parser.add_argument(
"--allowHashUrls",
help="Allow Hashtag URLs, useful for single-page-application crawling or "
"when different hashtags load dynamic content",
action="store_true",
)
parser.add_argument(
"--selectLinks",
help="One or more selectors for extracting links, in the format "
"[css selector]->[property to use],[css selector]->@[attribute to use]",
)
parser.add_argument(
"--clickSelector",
help="Selector for elements to click when using the autoclick behavior. Default"
" is 'a'",
) )
parser.add_argument(
"--blockRules",
help="Additional rules for blocking certain URLs from being loaded, by URL "
"regex and optionally via text match in an iframe",
)
parser.add_argument(
"--blockMessage",
help="If specified, when a URL is blocked, a record with this error message is"
" added instead",
)
parser.add_argument(
"--blockAds",
help="If set, block advertisements from being loaded (based on Stephen Black's"
" blocklist). Note that some bad domains are also blocked by zimit"
" configuration even if this option is not set.",
)
parser.add_argument(
"--adBlockMessage",
help="If specified, when an ad is blocked, a record with this error message is"
" added instead",
)
parser.add_argument(
"--collection",
help="Collection name to crawl to (replay will be accessible "
"under this name in pywb preview). Default is crawl-@ts.",
)
parser.add_argument(
"--headless",
help="Run in headless mode, otherwise start xvfb",
action="store_true",
)
parser.add_argument(
"--driver",
help="Custom driver for the crawler, if any",
)
parser.add_argument(
"--generateCDX",
help="If set, generate index (CDXJ) for use with pywb after crawl is done",
action="store_true",
)
parser.add_argument(
"--combineWARC",
help="If set, combine the warcs",
action="store_true",
)
parser.add_argument(
"--rolloverSize",
help="If set, declare the rollover size. Default is 1000000000.",
type=int,
)
parser.add_argument(
"--generateWACZ",
help="If set, generate WACZ on disk",
action="store_true",
)
parser.add_argument(
"--logging",
help="Crawler logging configuration",
)
parser.add_argument(
"--logLevel",
help="Comma-separated list of log levels to include in logs",
)
parser.add_argument(
"--logContext",
help="Comma-separated list of contexts to include in logs",
choices=[
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
"storage",
"text",
"exclusion",
"screenshots",
"screencast",
"originOverride",
"healthcheck",
"browser",
"blocking",
"behavior",
"behaviorScript",
"jsError",
"fetch",
"pageStatus",
"memoryStatus",
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
],
)
parser.add_argument(
"--logExcludeContext",
help="Comma-separated list of contexts to NOT include in logs. Default is "
"recorderNetwork,jsError,screencast",
choices=[
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
"storage",
"text",
"exclusion",
"screenshots",
"screencast",
"originOverride",
"healthcheck",
"browser",
"blocking",
"behavior",
"behaviorScript",
"jsError",
"fetch",
"pageStatus",
"memoryStatus",
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
"proxy",
],
)
parser.add_argument(
"--text",
help="Extract initial (default) or final text to pages.jsonl or WARC resource"
" record(s)",
)
# cwd is not manipulable
parser.add_argument(
"--mobileDevice",
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob/"
"main/packages/puppeteer-core/src/common/Device.ts",
-default="Pixel 2",
)
-parser.add_argument(
-"--noMobileDevice",
-help="Do not emulate a mobile device (use at your own risk, behavior is"
-"uncertain)",
-action="store_true",
-default=False,
-)
parser.add_argument(
@@ -255,33 +407,108 @@ def run(raw_args):
"(usually /sitemap.xml)",
)
parser.add_argument(
"--sitemapFromDate",
help="If set, filter URLs from sitemaps to those greater than or equal to (>=)"
" provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
)
parser.add_argument(
"--sitemapToDate",
help="If set, filter URLs from sitemaps to those less than or equal to (<=) "
"provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
)
parser.add_argument(
"--statsFilename",
help="If set, output stats as JSON to this file. (Relative filename resolves "
"to crawl working directory)",
)
parser.add_argument(
"--behaviors",
help="Which background behaviors to enable on each page. Default is autoplay,"
"autofetch,autoscroll,siteSpecific",
-default="autoplay,autofetch,siteSpecific",
)
parser.add_argument(
"--behaviorTimeout",
help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
"If 0, a behavior can run until finish. Default is 90.",
type=int,
)
parser.add_argument(
"--postLoadDelay",
help="If >0, amount of time to sleep (in seconds) after page has loaded, before"
" taking screenshots / getting text / running behaviors. Default is 0.",
type=int,
)
parser.add_argument(
"--delay",
help="If >0, amount of time to sleep (in seconds) after behaviors "
"before moving on to next page. Default is 0.",
type=int,
)
parser.add_argument(
"--dedupPolicy",
help="Deduplication policy. Default is skip",
choices=["skip", "revisit", "keep"],
)
parser.add_argument(
"--profile",
help="Path or HTTP(S) URL to tar.gz file which contains the browser profile "
"directory",
)
parser.add_argument(
"--screenshot",
help="Screenshot options for crawler. One of view, thumbnail, fullPage, "
"fullPageFinal or a comma-separated combination of those.",
)
parser.add_argument(
"--screencastPort",
help="If set to a non-zero value, starts an HTTP server with screencast "
"accessible on this port.",
type=int,
)
parser.add_argument(
"--screencastRedis",
help="If set, will use the state store redis pubsub for screencasting",
action="store_true",
)
parser.add_argument(
"--warcInfo",
help="Optional fields added to the warcinfo record in combined WARCs",
)
parser.add_argument(
"--saveState",
help="If the crawl state should be serialized to the crawls/ directory. "
"Defaults to 'partial', only saved when crawl is interrupted",
choices=["never", "partial", "always"],
)
parser.add_argument(
"--saveStateInterval",
help="If save state is set to 'always', also save state during the crawl at "
"this interval (in seconds). Default to 300.",
type=int,
)
parser.add_argument(
"--saveStateHistory",
help="Number of save states to keep during the duration of a crawl. "
"Default to 5.",
type=int,
)
size_group = parser.add_mutually_exclusive_group()
size_group.add_argument(
"--sizeSoftLimit",
@@ -329,7 +556,134 @@ def run(raw_args):
help="overwrite current crawl data: if set, existing collection directory "
"will be deleted before crawl is started",
action="store_true",
)
parser.add_argument(
"--waitOnDone",
help="if set, wait for interrupt signal when finished instead of exiting",
action="store_true",
)
parser.add_argument(
"--restartsOnError",
help="if set, assume will be restarted if interrupted, don't run post-crawl "
"processes on interrupt",
action="store_true",
)
parser.add_argument(
"--netIdleWait",
help="If set, wait for network idle after page load and after behaviors are "
"done (in seconds). if -1 (default), determine based on scope.",
type=int,
)
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
)
parser.add_argument(
"--originOverride",
help="if set, will redirect requests from each origin in key to origin in the "
"value, eg. --originOverride https://host:port=http://alt-host:alt-port",
)
parser.add_argument(
"--logErrorsToRedis",
help="If set, write error messages to redis",
action="store_true",
)
parser.add_argument(
"--writePagesToRedis",
help="If set, write page objects to redis",
action="store_true",
)
parser.add_argument(
"--maxPageRetries",
help="If set, number of times to retry a page that failed to load before page"
" is considered to have failed. Default is 2.",
type=int,
)
parser.add_argument(
"--failOnFailedSeed",
help="If set, crawler will fail with exit code 1 if any seed fails. When "
"combined with --failOnInvalidStatus, will result in crawl failing with exit "
"code 1 if any seed has a 4xx/5xx response",
action="store_true",
)
parser.add_argument(
"--failOnFailedLimit",
help="If set, save state and exit if number of failed pages exceeds this value",
action="store_true",
)
parser.add_argument(
"--failOnInvalidStatus",
help="If set, will treat pages with 4xx or 5xx response as failures. When "
"combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl "
"failing due to non-200 responses",
action="store_true",
)
# customBehaviors not included because it has special handling
# debugAccessRedis not included due to custom redis engine in zimit
parser.add_argument(
"--debugAccessBrowser",
help="if set, allow debugging browser on port 9222 via CDP",
action="store_true",
)
parser.add_argument(
"--warcPrefix",
help="prefix for WARC files generated, including WARCs added to WACZ",
)
parser.add_argument(
"--serviceWorker",
help="service worker handling: disabled, enabled or disabled-if-profile. "
"Default: disabled.",
)
parser.add_argument(
"--proxyServer",
help="if set, will use specified proxy server. Takes precedence over any env "
"var proxy settings",
)
parser.add_argument(
"--dryRun",
help="If true, no archive data is written to disk, only pages and logs (and "
"optionally saved state).",
action="store_true",
)
parser.add_argument(
"--qaSource",
help="Required for QA mode. Source (WACZ or multi WACZ) for QA",
)
parser.add_argument(
"--qaDebugImageDiff",
help="if specified, will write crawl.png, replay.png and diff.png for each "
"page where they're different",
action="store_true",
)
parser.add_argument(
"--sshProxyPrivateKeyFile",
help="path to SSH private key for SOCKS5 over SSH proxy connection",
)
parser.add_argument(
"--sshProxyKnownHostsFile",
help="path to SSH known hosts file for SOCKS5 over SSH proxy connection",
)
parser.add_argument(
@@ -355,11 +709,6 @@ def run(raw_args):
help="[warc2zim] Custom CSS file URL/path to inject into all articles",
)
-parser.add_argument(
-"--statsFilename",
-help="If set, output stats as JSON to this file",
-)
parser.add_argument(
"--config",
help="Path to YAML config file. If set, browsertrix-crawler will use this file"
@@ -374,8 +723,10 @@ def run(raw_args):
)
parser.add_argument(
"--zim-lang",
help="Language metadata of ZIM "
"(warc2zim --lang param). ISO-639-3 code. "
"Retrieved from homepage if found, fallback to `eng`",
)
parser.add_argument(
@@ -497,10 +848,6 @@ def run(raw_args):
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
-if not zimit_args.noMobileDevice:
-cmd_args.append("--mobileDevice")
-cmd_args.append(zimit_args.mobileDevice)
cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
@@ -681,13 +1028,14 @@ def get_cleaned_url(url: str):
def get_node_cmd_line(args):
-node_cmd = ["crawl", "--failOnFailedSeed"]
node_cmd = ["crawl"]
for arg in [
"title",
"description",
"urlFile",
"workers",
"crawlId",
"waitUntil",
"depth", "depth",
"extraHops", "extraHops",
"limit", "limit",
@ -698,13 +1046,44 @@ def get_node_cmd_line(args):
"exclude", "exclude",
"collection", "collection",
"allowHashUrls", "allowHashUrls",
"lang", "selectLinks",
"clickSelector",
"blockRules",
"blockMessage",
"blockAds",
"adBlockMessage",
"collection",
"headless",
"driver",
"generateCDX",
"combineWARC",
"rolloverSize",
"generateWACZ",
"logging",
"logLevel",
"logContext",
"logExcludeContext",
"text",
"mobileDevice",
"userAgent", "userAgent",
# userAgentSuffix (manipulated),
"useSitemap", "useSitemap",
"sitemapFromDate",
"sitemapToDate",
# statsFilename (manipulated),
"behaviors", "behaviors",
"behaviorTimeout", "behaviorTimeout",
"postLoadDelay",
"delay", "delay",
"dedupPolicy",
"profile", "profile",
"screenshot",
"screencastPort",
"screencastRedis",
"warcInfo",
"saveState",
"saveStateInterval",
"saveStateHistory",
"sizeSoftLimit", "sizeSoftLimit",
"sizeHardLimit", "sizeHardLimit",
"diskUtilization", "diskUtilization",
@@ -712,9 +1091,28 @@ def get_node_cmd_line(args):
"timeHardLimit",
"healthCheckPort",
"overwrite",
"waitOnDone",
"restartsOnError",
"netIdleWait",
"lang",
"originOverride",
"logErrorsToRedis",
"writePagesToRedis",
"maxPageRetries",
"failOnFailedSeed",
"failOnFailedLimit",
"failOnInvalidStatus",
"debugAccessBrowser",
"warcPrefix",
"serviceWorker",
"proxyServer",
"dryRun",
"qaSource",
"qaDebugImageDiff",
"sshProxyPrivateKeyFile",
"sshProxyKnownHostsFile",
"customBehaviors", "customBehaviors",
"config",
]:
value = getattr(args, arg)
if arg == "userAgent":
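The hunk ends mid-loop, so the forwarding logic that consumes `value` is not shown. A minimal sketch of the pattern the visible lines suggest (an assumption, not the verbatim zimit implementation): skip unset options, emit bare flags for booleans, and pass other values through unchanged.

```python
def forward_args(args, arg_names):
    """Sketch (assumption): turn parsed zimit options back into Browsertrix
    Crawler CLI flags, reusing the camelCase option names verbatim."""
    node_cmd = ["crawl"]
    for arg in arg_names:
        value = getattr(args, arg, None)
        if value is None or value is False:
            continue  # option was not supplied on the zimit command line
        node_cmd.append(f"--{arg}")
        if value is not True:  # store_true flags take no value
            node_cmd.append(str(value))
    return node_cmd
```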