From 8ecd0a321045c93d0f7032a3ef6560e1f2adbfed Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Mon, 10 Apr 2023 13:08:12 +0000 Subject: [PATCH] upgraded to browsertrix-crawler 0.9.0 --- CHANGELOG.md | 8 +++++++- Dockerfile | 2 +- zimit.py | 41 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55f4771..8cf9725 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- `--title` to set ZIM title +- `--description` to set ZIM description +- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization` + ### Changed -- Using browsertrix-crawler 0.9.0-beta.2 +- Using browsertrix-crawler 0.9.0 - Default and accepted values for `--waitUntil` from crawler's update - Using `main` warc2zim ⚠️ change before releasing! - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172) diff --git a/Dockerfile b/Dockerfile index 6923720..b402288 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:0.9.0-beta.2 +FROM webrecorder/browsertrix-crawler:0.9.0 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/zimit.py b/zimit.py index 0e03bed..97d098f 100755 --- a/zimit.py +++ b/zimit.py @@ -124,6 +124,8 @@ def zimit(args=None): ) parser.add_argument("-u", "--url", help="The URL to start crawling from") + parser.add_argument("--title", help="ZIM title") + parser.add_argument("--description", help="ZIM description") parser.add_argument( "--urlFile", @@ -155,6 +157,13 @@ def zimit(args=None): "--limit", help="Limit crawl to this number of pages", type=int, default=0 ) + parser.add_argument( + "--maxPageLimit", + help="Maximum pages to crawl, overriding pageLimit if both are set", + type=int, + default=0, + ) + parser.add_argument( "--timeout", help="Timeout for each page to load (in seconds)", @@ -217,7 +226,8 @@ def zimit(args=None): parser.add_argument( "--useSitemap", - help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)", + help="If set, use the URL as sitemap to get additional URLs for the crawl " + "(usually /sitemap.xml)", ) parser.add_argument( @@ -234,6 +244,14 @@ def zimit(args=None): default=90, ) + parser.add_argument( + "--delay", + help="If >0, amount of time to sleep (in seconds) after behaviors " + "before moving on to next page", + type=int, + default=0, + ) + parser.add_argument( "--profile", help="Path to tar.gz file which will be extracted " @@ -247,6 +265,14 @@ def zimit(args=None): default=0, ) + parser.add_argument( + "--diskUtilization", + help="If set, save state and exit if diskutilization " + "exceeds this percentage value", + type=int, + default=90, + ) + parser.add_argument( "--timeLimit", help="If set, save state and exit after time limit, in seconds", @@ -308,6 +334,14 @@ def zimit(args=None): if zimit_args.custom_css: warc2zim_args += ["--custom-css", zimit_args.custom_css] + if zimit_args.title: + warc2zim_args.append("--title") + warc2zim_args.append(zimit_args.title) + + if zimit_args.description: + warc2zim_args.append("--description") + warc2zim_args.append(zimit_args.description) + print("----------") print("Testing warc2zim args") print("Running: warc2zim " + " ".join(warc2zim_args), flush=True) @@ -430,9 +464,12 @@ def get_node_cmd_line(args): "workers", "waitUntil", "urlFile", + "title", + "description", "depth", "extraHops", "limit", + "maxPageLimit", "timeout", "scopeType", "include", @@ -444,8 +481,10 @@ def get_node_cmd_line(args): "useSitemap", "behaviors", "behaviorTimeout", + "delay", "profile", "sizeLimit", + "diskUtilization", "timeLimit", "healthCheckPort", "overwrite",