diff --git a/CHANGELOG.md b/CHANGELOG.md
index 308058b..3d20f59 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
+- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433)
+- Document all Browsertrix Crawler default argument values (#416)
 
 ### Fixed
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 49ead05..8634b71 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -129,6 +129,7 @@ def run(raw_args):
     )
 
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
+    parser.add_argument("--title", help="ZIM title")
     parser.add_argument("--description", help="ZIM description")
     parser.add_argument("--long-description", help="ZIM long description metadata")
 
@@ -138,52 +139,66 @@ def run(raw_args):
         help="If set, read a list of seed urls, one per line, from the specified",
     )
 
-    parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
+    parser.add_argument(
+        "-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
+    )
+
+    parser.add_argument(
+        "--crawlId",
+        help="A user provided ID for this crawl or crawl configuration (can also be "
+        "set via CRAWL_ID env var, defaults to hostname)",
+    )
 
     parser.add_argument(
         "--waitUntil",
         help="Puppeteer page.goto() condition to wait for before continuing. One of "
         "load, domcontentloaded, networkidle0 or networkidle2, or a "
-        "comma-separated combination of those.",
-        default="load",
+        "comma-separated combination of those. Default is load,networkidle2",
     )
 
     parser.add_argument(
-        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+        "--depth",
+        help="The depth of the crawl for all seeds. Default is -1.",
+        type=int,
     )
 
     parser.add_argument(
         "--extraHops",
-        help="Number of extra 'hops' to follow, beyond the current scope",
+        help="Number of extra 'hops' to follow, beyond the current scope. "
+        "Default is 0.",
         type=int,
     )
 
-    parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int)
+    parser.add_argument(
+        "--limit",
+        help="Limit crawl to this number of pages. Default is 0 (no limit).",
+        type=int,
+    )
 
     parser.add_argument(
         "--maxPageLimit",
-        help="Maximum pages to crawl, overriding pageLimit if both are set",
+        help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
+        "0 (no limit)",
         type=int,
     )
 
     parser.add_argument(
         "--timeout",
-        help="Timeout for each page to load (in seconds)",
+        help="Timeout for each page to load (in seconds). Default is 90 secs.",
         type=int,
-        default=90,
     )
 
     parser.add_argument(
         "--scopeType",
         help="A predefined scope of the crawl. For more customization, "
-        "use 'custom' and set scopeIncludeRx regexes",
+        "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom "
+        "if scopeIncludeRx is set, prefix otherwise.",
         choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
     )
 
     parser.add_argument(
         "--include",
-        help="Regex of page URLs that should be "
-        "included in the crawl (defaults to "
+        help="Regex of page URLs that should be included in the crawl (defaults to "
         "the immediate directory of URL)",
     )
 
@@ -192,48 +207,185 @@ def run(raw_args):
         help="Regex of page URLs that should be excluded from the crawl",
     )
 
-    parser.add_argument(
-        "--collection",
-        help="Collection name to crawl to (replay will be accessible "
-        "under this name in pywb preview) instead of crawl-@ts",
-    )
-
     parser.add_argument(
         "--allowHashUrls",
-        help="Allow Hashtag URLs, useful for "
-        "single-page-application crawling or "
-        "when different hashtags load dynamic "
-        "content",
+        help="Allow Hashtag URLs, useful for single-page-application crawling or "
+        "when different hashtags load dynamic content",
        action="store_true",
    )
 
     parser.add_argument(
-        "--lang",
-        help="if set, sets the language used by the browser, should be ISO 639 "
-        "language[-country] code",
+        "--selectLinks",
+        help="One or more selectors for extracting links, in the format "
+        "[css selector]->[property to use],[css selector]->@[attribute to use]",
     )
 
     parser.add_argument(
-        "--zim-lang",
-        help="Language metadata of ZIM "
-        "(warc2zim --lang param). ISO-639-3 code. "
-        "Retrieved from homepage if found, fallback to `eng`",
+        "--clickSelector",
+        help="Selector for elements to click when using the autoclick behavior. Default"
+        " is 'a'",
     )
 
+    parser.add_argument(
+        "--blockRules",
+        help="Additional rules for blocking certain URLs from being loaded, by URL "
+        "regex and optionally via text match in an iframe",
+    )
+
+    parser.add_argument(
+        "--blockMessage",
+        help="If specified, when a URL is blocked, a record with this error message is"
+        " added instead",
+    )
+
+    parser.add_argument(
+        "--blockAds",
+        help="If set, block advertisements from being loaded (based on Stephen Black's"
+        " blocklist). Note that some bad domains are also blocked by zimit"
+        " configuration even if this option is not set.",
+    )
+
+    parser.add_argument(
+        "--adBlockMessage",
+        help="If specified, when an ad is blocked, a record with this error message is"
+        " added instead",
+    )
+
+    parser.add_argument(
+        "--collection",
+        help="Collection name to crawl to (replay will be accessible "
+        "under this name in pywb preview). Default is crawl-@ts.",
+    )
+
+    parser.add_argument(
+        "--headless",
+        help="Run in headless mode, otherwise start xvfb",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--driver",
+        help="Custom driver for the crawler, if any",
+    )
+
+    parser.add_argument(
+        "--generateCDX",
+        help="If set, generate index (CDXJ) for use with pywb after crawl is done",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--combineWARC",
+        help="If set, combine the warcs",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--rolloverSize",
+        help="If set, declare the rollover size. Default is 1000000000.",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--generateWACZ",
+        help="If set, generate WACZ on disk",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--logging",
+        help="Crawler logging configuration",
+    )
+
+    parser.add_argument(
+        "--logLevel",
+        help="Comma-separated list of log levels to include in logs",
+    )
+
+    parser.add_argument(
+        "--logContext",
+        help="Comma-separated list of contexts to include in logs",
+        choices=[
+            "general",
+            "worker",
+            "recorder",
+            "recorderNetwork",
+            "writer",
+            "state",
+            "redis",
+            "storage",
+            "text",
+            "exclusion",
+            "screenshots",
+            "screencast",
+            "originOverride",
+            "healthcheck",
+            "browser",
+            "blocking",
+            "behavior",
+            "behaviorScript",
+            "jsError",
+            "fetch",
+            "pageStatus",
+            "memoryStatus",
+            "crawlStatus",
+            "links",
+            "sitemap",
+            "wacz",
+            "replay",
+            "proxy",
+        ],
+    )
+
+    parser.add_argument(
+        "--logExcludeContext",
+        help="Comma-separated list of contexts to NOT include in logs. Default is "
+        "recorderNetwork,jsError,screencast",
+        choices=[
+            "general",
+            "worker",
+            "recorder",
+            "recorderNetwork",
+            "writer",
+            "state",
+            "redis",
+            "storage",
+            "text",
+            "exclusion",
+            "screenshots",
+            "screencast",
+            "originOverride",
+            "healthcheck",
+            "browser",
+            "blocking",
+            "behavior",
+            "behaviorScript",
+            "jsError",
+            "fetch",
+            "pageStatus",
+            "memoryStatus",
+            "crawlStatus",
+            "links",
+            "sitemap",
+            "wacz",
+            "replay",
+            "proxy",
+        ],
+    )
+
+    parser.add_argument(
+        "--text",
+        help="Extract initial (default) or final text to pages.jsonl or WARC resource"
+        " record(s)",
+    )
+
+    # cwd is not manipulable
+
     parser.add_argument(
         "--mobileDevice",
         help="Emulate mobile device by name from "
         "https://github.com/puppeteer/puppeteer/blob/"
         "main/packages/puppeteer-core/src/common/Device.ts",
-        default="Pixel 2",
-    )
-
-    parser.add_argument(
-        "--noMobileDevice",
-        help="Do not emulate a mobile device (use at your own risk, behavior is"
-        "uncertain)",
-        action="store_true",
-        default=False,
     )
 
     parser.add_argument(
@@ -255,33 +407,108 @@ def run(raw_args):
         "(usually /sitemap.xml)",
     )
 
+    parser.add_argument(
+        "--sitemapFromDate",
+        help="If set, filter URLs from sitemaps to those greater than or equal to (>=)"
+        " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
+    )
+
+    parser.add_argument(
+        "--sitemapToDate",
+        help="If set, filter URLs from sitemaps to those less than or equal to (<=) "
+        "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
+    )
+
+    parser.add_argument(
+        "--statsFilename",
+        help="If set, output stats as JSON to this file. (Relative filename resolves "
+        "to crawl working directory)",
+    )
+
     parser.add_argument(
         "--behaviors",
-        help="Which background behaviors to enable on each page",
-        default="autoplay,autofetch,siteSpecific",
+        help="Which background behaviors to enable on each page. Default is autoplay,"
+        "autofetch,autoscroll,siteSpecific",
     )
 
     parser.add_argument(
         "--behaviorTimeout",
         help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
-        "If 0, a behavior can run until finish",
+        "If 0, a behavior can run until finish. Default is 90.",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--postLoadDelay",
+        help="If >0, amount of time to sleep (in seconds) after page has loaded, before"
+        " taking screenshots / getting text / running behaviors. Default is 0.",
         type=int,
-        default=90,
     )
 
     parser.add_argument(
         "--delay",
         help="If >0, amount of time to sleep (in seconds) after behaviors "
-        "before moving on to next page",
+        "before moving on to next page. Default is 0.",
         type=int,
     )
 
+    parser.add_argument(
+        "--dedupPolicy",
+        help="Deduplication policy. Default is skip",
+        choices=["skip", "revisit", "keep"],
+    )
+
     parser.add_argument(
         "--profile",
         help="Path or HTTP(S) URL to tar.gz file which contains the browser profile "
         "directory",
     )
 
+    parser.add_argument(
+        "--screenshot",
+        help="Screenshot options for crawler. One of view, thumbnail, fullPage, "
+        "fullPageFinal or a comma-separated combination of those.",
+    )
+
+    parser.add_argument(
+        "--screencastPort",
+        help="If set to a non-zero value, starts an HTTP server with screencast "
+        "accessible on this port.",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--screencastRedis",
+        help="If set, will use the state store redis pubsub for screencasting",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--warcInfo",
+        help="Optional fields added to the warcinfo record in combined WARCs",
+    )
+
+    parser.add_argument(
+        "--saveState",
+        help="If the crawl state should be serialized to the crawls/ directory. "
+        "Defaults to 'partial', only saved when crawl is interrupted",
+        choices=["never", "partial", "always"],
+    )
+
+    parser.add_argument(
+        "--saveStateInterval",
+        help="If save state is set to 'always', also save state during the crawl at "
+        "this interval (in seconds). Defaults to 300.",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--saveStateHistory",
+        help="Number of save states to keep during the duration of a crawl. "
+        "Defaults to 5.",
+        type=int,
+    )
+
     size_group = parser.add_mutually_exclusive_group()
     size_group.add_argument(
         "--sizeSoftLimit",
@@ -329,7 +556,134 @@ def run(raw_args):
         help="overwrite current crawl data: if set, existing collection directory "
         "will be deleted before crawl is started",
         action="store_true",
-        default=False,
+    )
+
+    parser.add_argument(
+        "--waitOnDone",
+        help="if set, wait for interrupt signal when finished instead of exiting",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--restartsOnError",
+        help="if set, assume will be restarted if interrupted, don't run post-crawl "
+        "processes on interrupt",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--netIdleWait",
+        help="If set, wait for network idle after page load and after behaviors are "
+        "done (in seconds). if -1 (default), determine based on scope.",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--lang",
+        help="if set, sets the language used by the browser, should be ISO 639 "
+        "language[-country] code",
+    )
+
+    parser.add_argument(
+        "--originOverride",
+        help="if set, will redirect requests from each origin in key to origin in the "
+        "value, eg. --originOverride https://host:port=http://alt-host:alt-port",
+    )
+
+    parser.add_argument(
+        "--logErrorsToRedis",
+        help="If set, write error messages to redis",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--writePagesToRedis",
+        help="If set, write page objects to redis",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--maxPageRetries",
+        help="If set, number of times to retry a page that failed to load before page"
+        " is considered to have failed. Default is 2.",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--failOnFailedSeed",
+        help="If set, crawler will fail with exit code 1 if any seed fails. When "
+        "combined with --failOnInvalidStatus, will result in crawl failing with exit "
+        "code 1 if any seed has a 4xx/5xx response",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--failOnFailedLimit",
+        help="If set, save state and exit if number of failed pages exceeds this value",
+        type=int,
+    )
+
+    parser.add_argument(
+        "--failOnInvalidStatus",
+        help="If set, will treat pages with 4xx or 5xx response as failures. When "
+        "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl "
+        "failing due to non-200 responses",
+        action="store_true",
+    )
+
+    # customBehaviors not included because it has special handling
+    # debugAccessRedis not included due to custom redis engine in zimit
+
+    parser.add_argument(
+        "--debugAccessBrowser",
+        help="if set, allow debugging browser on port 9222 via CDP",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--warcPrefix",
+        help="prefix for WARC files generated, including WARCs added to WACZ",
+    )
+
+    parser.add_argument(
+        "--serviceWorker",
+        help="service worker handling: disabled, enabled or disabled-if-profile. "
+        "Default: disabled.",
+    )
+
+    parser.add_argument(
+        "--proxyServer",
+        help="if set, will use specified proxy server. Takes precedence over any env "
+        "var proxy settings",
+    )
+
+    parser.add_argument(
+        "--dryRun",
+        help="If true, no archive data is written to disk, only pages and logs (and "
+        "optionally saved state).",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--qaSource",
+        help="Required for QA mode. Source (WACZ or multi WACZ) for QA",
+    )
+
+    parser.add_argument(
+        "--qaDebugImageDiff",
+        help="if specified, will write crawl.png, replay.png and diff.png for each "
+        "page where they're different",
+        action="store_true",
+    )
+
+    parser.add_argument(
+        "--sshProxyPrivateKeyFile",
+        help="path to SSH private key for SOCKS5 over SSH proxy connection",
+    )
+
+    parser.add_argument(
+        "--sshProxyKnownHostsFile",
+        help="path to SSH known hosts file for SOCKS5 over SSH proxy connection",
     )
 
     parser.add_argument(
@@ -355,11 +709,6 @@ def run(raw_args):
         help="[warc2zim] Custom CSS file URL/path to inject into all articles",
     )
 
-    parser.add_argument(
-        "--statsFilename",
-        help="If set, output stats as JSON to this file",
-    )
-
     parser.add_argument(
         "--config",
         help="Path to YAML config file. If set, browsertrix-crawler will use this file"
@@ -374,8 +723,10 @@ def run(raw_args):
     )
 
     parser.add_argument(
-        "--logging",
-        help="Crawler logging configuration",
+        "--zim-lang",
+        help="Language metadata of ZIM "
+        "(warc2zim --lang param). ISO-639-3 code. "
+        "Retrieved from homepage if found, fallback to `eng`",
     )
 
     parser.add_argument(
@@ -497,10 +848,6 @@ def run(raw_args):
         cmd_args.append("--userAgentSuffix")
         cmd_args.append(user_agent_suffix)
 
-    if not zimit_args.noMobileDevice:
-        cmd_args.append("--mobileDevice")
-        cmd_args.append(zimit_args.mobileDevice)
-
     cmd_args.append("--cwd")
     cmd_args.append(str(temp_root_dir))
 
@@ -681,13 +1028,14 @@ def get_cleaned_url(url: str):
 
 
 def get_node_cmd_line(args):
-    node_cmd = ["crawl", "--failOnFailedSeed"]
+    node_cmd = ["crawl"]
     for arg in [
-        "workers",
-        "waitUntil",
-        "urlFile",
         "title",
         "description",
+        "urlFile",
+        "workers",
+        "crawlId",
+        "waitUntil",
         "depth",
         "extraHops",
         "limit",
@@ -698,13 +1046,44 @@ def get_node_cmd_line(args):
         "exclude",
         "collection",
         "allowHashUrls",
-        "lang",
+        "selectLinks",
+        "clickSelector",
+        "blockRules",
+        "blockMessage",
+        "blockAds",
+        "adBlockMessage",
+        "headless",
+        "driver",
+        "generateCDX",
+        "combineWARC",
+        "rolloverSize",
+        "generateWACZ",
+        "logging",
+        "logLevel",
+        "logContext",
+        "logExcludeContext",
+        "text",
+        "mobileDevice",
         "userAgent",
+        # userAgentSuffix (manipulated),
         "useSitemap",
+        "sitemapFromDate",
+        "sitemapToDate",
+        # statsFilename (manipulated),
         "behaviors",
         "behaviorTimeout",
+        "postLoadDelay",
         "delay",
+        "dedupPolicy",
         "profile",
+        "screenshot",
+        "screencastPort",
+        "screencastRedis",
+        "warcInfo",
+        "saveState",
+        "saveStateInterval",
+        "saveStateHistory",
         "sizeSoftLimit",
         "sizeHardLimit",
         "diskUtilization",
@@ -712,9 +1091,28 @@ def get_node_cmd_line(args):
         "timeHardLimit",
         "healthCheckPort",
         "overwrite",
-        "config",
-        "logging",
+        "waitOnDone",
+        "restartsOnError",
+        "netIdleWait",
+        "lang",
+        "originOverride",
+        "logErrorsToRedis",
+        "writePagesToRedis",
+        "maxPageRetries",
+        "failOnFailedSeed",
+        "failOnFailedLimit",
+        "failOnInvalidStatus",
+        "debugAccessBrowser",
+        "warcPrefix",
+        "serviceWorker",
+        "proxyServer",
+        "dryRun",
+        "qaSource",
+        "qaDebugImageDiff",
+        "sshProxyPrivateKeyFile",
+        "sshProxyKnownHostsFile",
        "customBehaviors",
+        "config",
     ]:
         value = getattr(args, arg)
         if arg == "userAgent":