diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6d7044..308058b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 
 - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
+- Keep temporary folder when crawler or warc2zim fails, even if --keep was not requested (#468)
 
 ### Fixed
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 416bec9..70dfdbd 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -33,6 +33,8 @@ from zimit.constants import (
 )
 from zimit.utils import download_file
 
+temp_root_dir: Path | None = None
+
 
 class ProgressFileWatcher:
     def __init__(self, output_dir: Path, stats_path: Path):
@@ -103,6 +105,24 @@ class ProgressFileWatcher:
             json.dump(out, ofh)
 
 
+def cleanup():
+    if not temp_root_dir:
+        logger.warning("Temporary root dir not set, nothing to clean up")
+        return
+    logger.info("")
+    logger.info("----------")
+    logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
+    shutil.rmtree(temp_root_dir)
+
+
+def cancel_cleanup():
+    logger.info(
+        f"Temporary files have been kept in {temp_root_dir}, please clean them"
+        " up manually once you don't need them anymore"
+    )
+    atexit.unregister(cleanup)
+
+
 def run(raw_args):
     parser = ArgumentParser(
         description="Run a browser-based crawl on the specified URL and convert to ZIM"
@@ -427,19 +447,13 @@ def run(raw_args):
         return EXIT_CODE_WARC2ZIM_CHECK_FAILED
 
     # make temp dir for this crawl
+    global temp_root_dir  # noqa: PLW0603
    if zimit_args.build:
         temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
     else:
         temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
 
     if not zimit_args.keep:
-
-        def cleanup():
-            logger.info("")
-            logger.info("----------")
-            logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
-            shutil.rmtree(temp_root_dir)
-
         atexit.register(cleanup)
 
     # copy / download custom behaviors to one single folder and configure crawler
@@ -599,6 +613,7 @@ def run(raw_args):
             logger.error(
                 f"Crawl returned an error: {crawl.returncode}, scraper exiting"
             )
+            cancel_cleanup()
             return crawl.returncode
 
     if zimit_args.collection:
@@ -642,6 +657,11 @@ def run(raw_args):
         stats_content["partialZim"] = partial_zim
         stats.write_text(json.dumps(stats_content))
 
+    # also call cancel_cleanup when --keep is set (unregistering is a no-op then),
+    # so that we display the temporary files location just like in other situations
+    if warc2zim_exit_code or zimit_args.keep:
+        cancel_cleanup()
+
     return warc2zim_exit_code
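
Reviewer note: the patch hinges on atexit.unregister() being a harmless no-op when the
callback was never registered, which is why the final hunk can call cancel_cleanup()
unconditionally under --keep. Below is a minimal standalone sketch of that
register/unregister pattern; the run() signature and its keep/fail parameters are toy
illustrations, not zimit's actual CLI.

import atexit
import shutil
import tempfile
from pathlib import Path

temp_root_dir: Path | None = None


def cleanup():
    # Registered with atexit below; removes the temp tree at interpreter exit.
    if temp_root_dir:
        shutil.rmtree(temp_root_dir, ignore_errors=True)


def cancel_cleanup():
    # Safe to call unconditionally: unregistering a callback that was never
    # registered is a documented no-op.
    atexit.unregister(cleanup)


def run(keep: bool = False, fail: bool = False) -> int:
    global temp_root_dir
    temp_root_dir = Path(tempfile.mkdtemp(prefix=".tmp"))
    if not keep:
        atexit.register(cleanup)
    if fail:
        cancel_cleanup()  # on failure: keep temp files even without --keep
        return 1
    if keep:
        cancel_cleanup()  # --keep: unregister is a no-op, files survive
    return 0

Calling run(fail=True) leaves the .tmp directory behind exactly as the patched zimit
does when the crawler or warc2zim fails, while run() alone still cleans up on exit.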