NOTE(review): line breaks and context-line indentation were reconstructed from a
whitespace-collapsed copy of this patch; confirm with `git apply --check` before use.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6d7044..308058b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 
 - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
+- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
 
 ### Fixed
 
diff --git a/README.md b/README.md
index 9bfba9b..1598ead 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ The image accepts the following parameters, **as well as any of the [warc2zim](h
 - `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
 - `--workers N` - number of crawl workers to be run in parallel
 - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
-- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
+- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
 
 Example command:
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 416bec9..49ead05 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -33,6 +33,8 @@ from zimit.constants import (
 )
 from zimit.utils import download_file
 
+temp_root_dir: Path | None = None
+
 
 class ProgressFileWatcher:
     def __init__(self, output_dir: Path, stats_path: Path):
@@ -103,6 +105,32 @@ class ProgressFileWatcher:
             json.dump(out, ofh)
 
 
+def cleanup():
+    """Remove the temporary root directory of the current run.
+
+    Only logs a warning and returns when ``temp_root_dir`` has not been set.
+    """
+    if not temp_root_dir:
+        logger.warning("Temporary root dir not already set, cannot clean this up")
+        return
+    logger.info("")
+    logger.info("----------")
+    logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
+    shutil.rmtree(temp_root_dir)
+
+
+def cancel_cleanup():
+    """Log where temporary files were kept and unregister ``cleanup`` from atexit.
+
+    ``atexit.unregister`` does nothing when ``cleanup`` was never registered.
+    """
+    logger.info(
+        f"Temporary files have been kept in {temp_root_dir}, please clean them"
+        " up manually once you don't need them anymore"
+    )
+    atexit.unregister(cleanup)
+
+
 def run(raw_args):
     parser = ArgumentParser(
         description="Run a browser-based crawl on the specified URL and convert to ZIM"
@@ -314,7 +342,10 @@ def run(raw_args):
 
     parser.add_argument(
         "--keep",
-        help="If set, keep WARC files after crawl, don't delete",
+        help="In case of failure, WARC files and other temporary files (which are "
+        "stored as a subfolder of output directory) are always kept, otherwise "
+        "they are automatically deleted. Use this flag to always keep WARC files, "
+        "even in case of success.",
         action="store_true",
     )
 
@@ -427,19 +458,13 @@ def run(raw_args):
         return EXIT_CODE_WARC2ZIM_CHECK_FAILED
 
     # make temp dir for this crawl
+    global temp_root_dir  # noqa: PLW0603
     if zimit_args.build:
         temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
     else:
         temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
 
     if not zimit_args.keep:
-
-        def cleanup():
-            logger.info("")
-            logger.info("----------")
-            logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
-            shutil.rmtree(temp_root_dir)
-
         atexit.register(cleanup)
 
     # copy / download custom behaviors to one single folder and configure crawler
@@ -599,6 +624,7 @@ def run(raw_args):
             logger.error(
                 f"Crawl returned an error: {crawl.returncode}, scraper exiting"
             )
+            cancel_cleanup()
             return crawl.returncode
 
         if zimit_args.collection:
@@ -642,6 +668,11 @@ def run(raw_args):
         stats_content["partialZim"] = partial_zim
         stats.write_text(json.dumps(stats_content))
 
+    # also call cancel_cleanup when --keep, even if it is not supposed to be registered,
+    # so that we will display temporary files location just like in other situations
+    if warc2zim_exit_code or zimit_args.keep:
+        cancel_cleanup()
+
     return warc2zim_exit_code
 
 