Fixes following review + we need to create one subdir per run to not mix data / cleanup correctly after run

This commit is contained in:
benoit74 2023-11-23 13:08:45 +01:00
parent a2b4c71ec9
commit d6c0c6ce63
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
2 changed files with 18 additions and 17 deletions

View File

@ -5,11 +5,13 @@ All notable changes to this project are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
## Unreleased ## [Unreleased]
### Added ### Added
- `--tmp` parameter (optional) - New `--build` parameter (optional) to specify the directory holding Browsertrix files ; if not set, `--output`
directory is used ; zimit creates one subdir of this folder per invocation to isolate datasets ; subdir is kept only
if `--keep` is set.
### Fixed ### Fixed

View File

@ -311,9 +311,8 @@ def zimit(args=None):
parser.add_argument("--output", help="Output directory for ZIM", default="/output") parser.add_argument("--output", help="Output directory for ZIM", default="/output")
parser.add_argument( parser.add_argument(
"--tmp", "--build",
help="Temporary directory for WARC files (if not set, will be created" help="Build directory for WARC files (if not set, output directory is used)",
"as a temporary subdir of output directory)",
) )
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler") parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
@ -381,10 +380,10 @@ def zimit(args=None):
print("Exiting, invalid warc2zim params") print("Exiting, invalid warc2zim params")
return 2 return 2
if zimit_args.tmp: # make temp dir for this crawl
temp_root_dir = Path(zimit_args.tmp) if zimit_args.build:
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
else: else:
# make temp dir for this crawl
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
if not zimit_args.keep: if not zimit_args.keep:
@ -438,27 +437,27 @@ def zimit(args=None):
raise subprocess.CalledProcessError(crawl.returncode, cmd_args) raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
if zimit_args.collection: if zimit_args.collection:
warc_files = temp_root_dir.joinpath( warc_directory = temp_root_dir.joinpath(
f"collections/{zimit_args.collection}/archive/" f"collections/{zimit_args.collection}/archive/"
) )
else: else:
warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/")) warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
if len(warc_files) == 0: if len(warc_dirs) == 0:
raise RuntimeError( raise RuntimeError(
"Failed to find directory where WARC files have been created" "Failed to find directory where WARC files have been created"
) )
elif len(warc_files) > 1: elif len(warc_dirs) > 1:
print("Found many WARC files directories, only last one will be used") print("Found many WARC files directories, only last one will be used")
for directory in warc_files: for directory in warc_dirs:
print(f"- {directory}") print(f"- {directory}")
warc_files = warc_files[-1] warc_directory = warc_dirs[-1]
print("") print("")
print("----------") print("----------")
print(f"Processing WARC files in {warc_files}") print(f"Processing WARC files in {warc_directory}")
warc2zim_args.append(str(warc_files)) warc2zim_args.append(str(warc_directory))
num_files = sum(1 for _ in warc_files.iterdir()) num_files = sum(1 for _ in warc_directory.iterdir())
print(f"{num_files} WARC files found", flush=True) print(f"{num_files} WARC files found", flush=True)
print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True) print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)