Fix handling of '--collection' parameter + add '--tmp' + enhance logging

This commit is contained in:
benoit74 2023-11-23 08:51:48 +01:00
parent 79d5f8bc7b
commit b98e8f7027
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
2 changed files with 38 additions and 8 deletions

View File

@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased
### Added
- `--tmp` parameter (optional) to choose the temporary directory used for WARC files
### Fixed
- `--collection` parameter was not working (#252)
## [1.6.2] - 2023-11-17
### Changed

View File

@@ -308,8 +308,12 @@ def zimit(args=None):
action="store_true",
)
parser.add_argument("--output", help="Output directory for ZIM", default="/output")
parser.add_argument(
"--output", help="Output directory for ZIM and WARC files", default="/output"
"--tmp",
help="Temporary directory for WARC files (if not set, will be created"
"as a temporary subdir of output directory)",
)
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
@@ -377,6 +381,9 @@ def zimit(args=None):
print("Exiting, invalid warc2zim params")
return 2
if zimit_args.tmp:
temp_root_dir = Path(zimit_args.tmp)
else:
# make temp dir for this crawl
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
@@ -430,14 +437,29 @@ def zimit(args=None):
elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
warc2zim_args.append(str(warc_files))
num_files = sum(1 for e in warc_files.iterdir())
if zimit_args.collection:
warc_files = temp_root_dir.joinpath(
f"collections/{zimit_args.collection}/archive/"
)
else:
warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
if len(warc_files) == 0:
raise RuntimeError(
"Failed to find directory where WARC files have been created"
)
elif len(warc_files) > 1:
print("Found many WARC files directories, only last one will be used")
for directory in warc_files:
print(f"- {directory}")
warc_files = warc_files[-1]
print("")
print("----------")
print(f"Processing {num_files} WARC files to ZIM", flush=True)
print(f"Processing WARC files in {warc_files}")
warc2zim_args.append(str(warc_files))
num_files = sum(1 for _ in warc_files.iterdir())
print(f"{num_files} WARC files found", flush=True)
return warc2zim(warc2zim_args)