Keep temporary folder when crawler or warc2zim fails, even if not asked for

This commit is contained in:
benoit74 2025-02-13 13:18:06 +00:00
parent bc73193ce0
commit ee82837aaa
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
2 changed files with 28 additions and 7 deletions

View File

@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ### Changed
- Change solution to report partial ZIM to the Zimfarm and other clients (#304) - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
### Fixed ### Fixed

View File

@ -33,6 +33,8 @@ from zimit.constants import (
) )
from zimit.utils import download_file from zimit.utils import download_file
temp_root_dir: Path | None = None
class ProgressFileWatcher: class ProgressFileWatcher:
def __init__(self, output_dir: Path, stats_path: Path): def __init__(self, output_dir: Path, stats_path: Path):
@ -103,6 +105,24 @@ class ProgressFileWatcher:
json.dump(out, ofh) json.dump(out, ofh)
def cleanup():
    """Delete the temporary root directory used for the crawl.

    Reads the module-level ``temp_root_dir``. When it has not been set
    (still falsy), nothing is removed and a warning is logged instead.
    Otherwise a short banner is logged and the whole directory tree is
    removed with ``shutil.rmtree``.
    """
    if temp_root_dir:
        # Blank line + separator so the cleanup notice stands out in the log.
        for line in (
            "",
            "----------",
            f"Cleanup, removing temp dir: {temp_root_dir}",
        ):
            logger.info(line)
        shutil.rmtree(temp_root_dir)
        return

    logger.warning("Temporary root dir not already set, cannot clean this up")
def cancel_cleanup():
    """Keep the temporary files and tell the user where they are.

    Logs the location held in the module-level ``temp_root_dir`` and then
    removes ``cleanup`` from the ``atexit`` handlers, so the directory is
    not deleted when the interpreter exits.
    """
    notice = (
        f"Temporary files have been kept in {temp_root_dir}, please clean them"
        " up manually once you don't need them anymore"
    )
    logger.info(notice)
    # Unregistering is harmless when cleanup was never registered.
    atexit.unregister(cleanup)
def run(raw_args): def run(raw_args):
parser = ArgumentParser( parser = ArgumentParser(
description="Run a browser-based crawl on the specified URL and convert to ZIM" description="Run a browser-based crawl on the specified URL and convert to ZIM"
@ -427,19 +447,13 @@ def run(raw_args):
return EXIT_CODE_WARC2ZIM_CHECK_FAILED return EXIT_CODE_WARC2ZIM_CHECK_FAILED
# make temp dir for this crawl # make temp dir for this crawl
global temp_root_dir # noqa: PLW0603
if zimit_args.build: if zimit_args.build:
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
else: else:
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
if not zimit_args.keep: if not zimit_args.keep:
def cleanup():
logger.info("")
logger.info("----------")
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
shutil.rmtree(temp_root_dir)
atexit.register(cleanup) atexit.register(cleanup)
# copy / download custom behaviors to one single folder and configure crawler # copy / download custom behaviors to one single folder and configure crawler
@ -599,6 +613,7 @@ def run(raw_args):
logger.error( logger.error(
f"Crawl returned an error: {crawl.returncode}, scraper exiting" f"Crawl returned an error: {crawl.returncode}, scraper exiting"
) )
cancel_cleanup()
return crawl.returncode return crawl.returncode
if zimit_args.collection: if zimit_args.collection:
@ -642,6 +657,11 @@ def run(raw_args):
stats_content["partialZim"] = partial_zim stats_content["partialZim"] = partial_zim
stats.write_text(json.dumps(stats_content)) stats.write_text(json.dumps(stats_content))
# also call cancel_cleanup when --keep, even if it is not supposed to be registered,
# so that we will display temporary files location just like in other situations
if warc2zim_exit_code or zimit_args.keep:
cancel_cleanup()
return warc2zim_exit_code return warc2zim_exit_code