mirror of
https://github.com/openzim/zimit.git
synced 2025-09-24 04:30:11 -04:00
Fix handling of '--collection' parameter + add '--tmp' + enhance logging
This commit is contained in:
parent
79d5f8bc7b
commit
b98e8f7027
@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- `--tmp` parameter (optional)
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- `--collection` parameter was not working (#252)
|
||||||
|
|
||||||
## [1.6.2] - 2023-11-17
|
## [1.6.2] - 2023-11-17
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
38
zimit.py
38
zimit.py
@ -308,8 +308,12 @@ def zimit(args=None):
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument("--output", help="Output directory for ZIM", default="/output")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output", help="Output directory for ZIM and WARC files", default="/output"
|
"--tmp",
|
||||||
|
help="Temporary directory for WARC files (if not set, will be created"
|
||||||
|
"as a temporary subdir of output directory)",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
|
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
|
||||||
@ -377,8 +381,11 @@ def zimit(args=None):
|
|||||||
print("Exiting, invalid warc2zim params")
|
print("Exiting, invalid warc2zim params")
|
||||||
return 2
|
return 2
|
||||||
|
|
||||||
# make temp dir for this crawl
|
if zimit_args.tmp:
|
||||||
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
|
temp_root_dir = Path(zimit_args.tmp)
|
||||||
|
else:
|
||||||
|
# make temp dir for this crawl
|
||||||
|
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
|
||||||
|
|
||||||
if not zimit_args.keep:
|
if not zimit_args.keep:
|
||||||
|
|
||||||
@ -430,14 +437,29 @@ def zimit(args=None):
|
|||||||
elif crawl.returncode != 0:
|
elif crawl.returncode != 0:
|
||||||
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
|
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
|
||||||
|
|
||||||
warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
|
if zimit_args.collection:
|
||||||
warc2zim_args.append(str(warc_files))
|
warc_files = temp_root_dir.joinpath(
|
||||||
|
f"collections/{zimit_args.collection}/archive/"
|
||||||
num_files = sum(1 for e in warc_files.iterdir())
|
)
|
||||||
|
else:
|
||||||
|
warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
|
||||||
|
if len(warc_files) == 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Failed to find directory where WARC files have been created"
|
||||||
|
)
|
||||||
|
elif len(warc_files) > 1:
|
||||||
|
print("Found many WARC files directories, only last one will be used")
|
||||||
|
for directory in warc_files:
|
||||||
|
print(f"- {directory}")
|
||||||
|
warc_files = warc_files[-1]
|
||||||
|
|
||||||
print("")
|
print("")
|
||||||
print("----------")
|
print("----------")
|
||||||
print(f"Processing {num_files} WARC files to ZIM", flush=True)
|
print(f"Processing WARC files in {warc_files}")
|
||||||
|
warc2zim_args.append(str(warc_files))
|
||||||
|
|
||||||
|
num_files = sum(1 for _ in warc_files.iterdir())
|
||||||
|
print(f"{num_files} WARC files found", flush=True)
|
||||||
|
|
||||||
return warc2zim(warc2zim_args)
|
return warc2zim(warc2zim_args)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user