From b98e8f7027b5b79e44526e7318af1a41a371562b Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 23 Nov 2023 08:51:48 +0100
Subject: [PATCH] Fix handling of '--collection' parameter + add '--tmp' + enhance logging

---
Review notes (placed below the "---" marker, so they are not part of the
commit message):

* The patch text had been collapsed onto two physical lines; the line
  structure below is rebuilt from the hunk headers and the diffstat.
* `--tmp` help text: added the missing space between the two adjacent
  string literals, which otherwise render as "will be createdas a
  temporary subdir". The blob id on the zimit.py "index" line predates
  this one-character tweak.
* NOTE(review): with `--collection`, the archive path is built with
  joinpath() and first touched by iterdir(); a missing directory would
  presumably surface there as FileNotFoundError instead of the explicit
  RuntimeError raised in the other branch -- confirm this is intended.
* NOTE(review): the "last" rglob() match is used when several crawl
  directories exist; rglob() ordering does not appear to be guaranteed,
  so consider sorting the matches -- confirm.
* NOTE(review): a user-supplied `--tmp` directory is used as-is in the
  hunks shown here; whether it is created or cleaned up elsewhere is
  not visible in this patch -- verify.

 CHANGELOG.md |  8 ++++++++
 zimit.py     | 38 ++++++++++++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 45136ac..5ceec97 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Added
+
+- `--tmp` parameter (optional)
+
+### Fixed
+
+- `--collection` parameter was not working (#252)
+
 ## [1.6.2] - 2023-11-17
 
 ### Changed
diff --git a/zimit.py b/zimit.py
index 331e025..6279e3d 100755
--- a/zimit.py
+++ b/zimit.py
@@ -308,8 +308,12 @@ def zimit(args=None):
         action="store_true",
     )
 
+    parser.add_argument("--output", help="Output directory for ZIM", default="/output")
+
     parser.add_argument(
-        "--output", help="Output directory for ZIM and WARC files", default="/output"
+        "--tmp",
+        help="Temporary directory for WARC files (if not set, will be created"
+        " as a temporary subdir of output directory)",
     )
 
     parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
@@ -377,8 +381,11 @@ def zimit(args=None):
         print("Exiting, invalid warc2zim params")
         return 2
 
-    # make temp dir for this crawl
-    temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
+    if zimit_args.tmp:
+        temp_root_dir = Path(zimit_args.tmp)
+    else:
+        # make temp dir for this crawl
+        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
 
     if not zimit_args.keep:
 
@@ -430,14 +437,29 @@ def zimit(args=None):
     elif crawl.returncode != 0:
         raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
 
-    warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
-    warc2zim_args.append(str(warc_files))
-
-    num_files = sum(1 for e in warc_files.iterdir())
+    if zimit_args.collection:
+        warc_files = temp_root_dir.joinpath(
+            f"collections/{zimit_args.collection}/archive/"
+        )
+    else:
+        warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
+        if len(warc_files) == 0:
+            raise RuntimeError(
+                "Failed to find directory where WARC files have been created"
+            )
+        elif len(warc_files) > 1:
+            print("Found many WARC files directories, only last one will be used")
+            for directory in warc_files:
+                print(f"- {directory}")
+        warc_files = warc_files[-1]
 
     print("")
     print("----------")
-    print(f"Processing {num_files} WARC files to ZIM", flush=True)
+    print(f"Processing WARC files in {warc_files}")
+    warc2zim_args.append(str(warc_files))
+
+    num_files = sum(1 for _ in warc_files.iterdir())
+    print(f"{num_files} WARC files found", flush=True)
 
     return warc2zim(warc2zim_args)
 