From d24775d70cf1c5d5e954284da04ef5aa2c0857c4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 15 Nov 2023 15:11:42 +0100 Subject: [PATCH 1/3] Fix logic passing args to crawler - do not set arg only if value is None or False - remove default value 0 from args (this was not passed but would be with new corrected code and would induce a different crawler behavior in fact) --- CHANGELOG.md | 4 ++++ zimit.py | 17 ++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3db9c67..72f838b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Fix logic passing args to crawler to support value '0' (#245) + ## [1.6.1] - 2023-11-06 ### Changed diff --git a/zimit.py b/zimit.py index 963f832..cf8b970 100755 --- a/zimit.py +++ b/zimit.py @@ -151,18 +151,16 @@ def zimit(args=None): "--extraHops", help="Number of extra 'hops' to follow, beyond the current scope", type=int, - default=0, ) parser.add_argument( - "--limit", help="Limit crawl to this number of pages", type=int, default=0 + "--limit", help="Limit crawl to this number of pages", type=int ) parser.add_argument( "--maxPageLimit", help="Maximum pages to crawl, overriding pageLimit if both are set", type=int, - default=0, ) parser.add_argument( @@ -263,7 +261,6 @@ def zimit(args=None): help="If >0, amount of time to sleep (in seconds) after behaviors " "before moving on to next page", type=int, - default=0, ) parser.add_argument( @@ -276,7 +273,6 @@ def zimit(args=None): "--sizeLimit", help="If set, save state and exit if size limit exceeds this value", type=int, - default=0, ) parser.add_argument( @@ -291,14 +287,12 @@ def zimit(args=None): "--timeLimit", help="If set, save state and exit after time limit, in seconds", type=int, - default=0, ) parser.add_argument( "--healthCheckPort", help="port to run healthcheck on", type=int, - default=0, ) parser.add_argument( @@ -522,10 +516,11 @@ def get_node_cmd_line(args): "config", ]: value = getattr(args, arg) - if value: - node_cmd.append("--" + arg) - if not isinstance(value, bool): - node_cmd.append(str(value)) + if value == None or (isinstance(value, bool) and value == False): + continue + node_cmd.append("--" + arg) + if not isinstance(value, bool): + node_cmd.append(str(value)) return node_cmd From 4ad41a7d54436b24c25d6226a840399e8bb2a97c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 15 Nov 2023 15:23:03 +0100 Subject: [PATCH 2/3] Upgrade to browsertrix crawler 0.12.2 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72f838b..894a22e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Changed + +- Using browsertrix-crawler 0.12.2 + ### Fixed - Fix logic passing args to crawler to support value '0' (#245) diff --git a/Dockerfile b/Dockerfile index 4e97c5a..ae3caea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:0.12.1 +FROM webrecorder/browsertrix-crawler:0.12.2 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ From 51d04091289af5bab4c67bd49d407d8f250af422 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 16 Nov 2023 08:22:23 +0100 Subject: [PATCH 3/3] Add venv to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7da9385..ddea4cf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__ collections/ node_modules/ output/ +venv \ No newline at end of file