upgraded to browsertrix-crawler 0.9.0

This commit is contained in:
renaud gaudin 2023-04-10 13:08:12 +00:00
parent 4f676e37c7
commit 8ecd0a3210
3 changed files with 48 additions and 3 deletions

View File

@ -7,9 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- `--title` to set ZIM title
- `--description` to set ZIM description
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
### Changed
- Using browsertrix-crawler 0.9.0-beta.2
- Using browsertrix-crawler 0.9.0
- Default and accepted values for `--waitUntil` from crawler's update
- Using `main` branch of warc2zim ⚠️ pin to a released warc2zim version before releasing!
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)

View File

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:0.9.0-beta.2
FROM webrecorder/browsertrix-crawler:0.9.0
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*

View File

@ -124,6 +124,8 @@ def zimit(args=None):
)
parser.add_argument("-u", "--url", help="The URL to start crawling from")
parser.add_argument("--title", help="ZIM title")
parser.add_argument("--description", help="ZIM description")
parser.add_argument(
"--urlFile",
@ -155,6 +157,13 @@ def zimit(args=None):
"--limit", help="Limit crawl to this number of pages", type=int, default=0
)
parser.add_argument(
"--maxPageLimit",
help="Maximum pages to crawl, overriding pageLimit if both are set",
type=int,
default=0,
)
parser.add_argument(
"--timeout",
help="Timeout for each page to load (in seconds)",
@ -217,7 +226,8 @@ def zimit(args=None):
parser.add_argument(
"--useSitemap",
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
help="If set, use the URL as sitemap to get additional URLs for the crawl "
"(usually /sitemap.xml)",
)
parser.add_argument(
@ -234,6 +244,14 @@ def zimit(args=None):
default=90,
)
parser.add_argument(
"--delay",
help="If >0, amount of time to sleep (in seconds) after behaviors "
"before moving on to next page",
type=int,
default=0,
)
parser.add_argument(
"--profile",
help="Path to tar.gz file which will be extracted "
@ -247,6 +265,14 @@ def zimit(args=None):
default=0,
)
parser.add_argument(
"--diskUtilization",
help="If set, save state and exit if diskutilization "
"exceeds this percentage value",
type=int,
default=90,
)
parser.add_argument(
"--timeLimit",
help="If set, save state and exit after time limit, in seconds",
@ -308,6 +334,14 @@ def zimit(args=None):
if zimit_args.custom_css:
warc2zim_args += ["--custom-css", zimit_args.custom_css]
if zimit_args.title:
warc2zim_args.append("--title")
warc2zim_args.append(zimit_args.title)
if zimit_args.description:
warc2zim_args.append("--description")
warc2zim_args.append(zimit_args.description)
print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
@ -430,9 +464,12 @@ def get_node_cmd_line(args):
"workers",
"waitUntil",
"urlFile",
"title",
"description",
"depth",
"extraHops",
"limit",
"maxPageLimit",
"timeout",
"scopeType",
"include",
@ -444,8 +481,10 @@ def get_node_cmd_line(args):
"useSitemap",
"behaviors",
"behaviorTimeout",
"delay",
"profile",
"sizeLimit",
"diskUtilization",
"timeLimit",
"healthCheckPort",
"overwrite",