mirror of
https://github.com/openzim/zimit.git
synced 2025-09-27 06:43:38 -04:00
Merge pull request #470 from openzim/keep_tmp_folder
Keep temporary folder when crawler or warc2zim fails, even if not asked for
This commit is contained in:
commit
4f9085b10e
@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
|
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
|
||||||
|
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
|
@ -48,7 +48,7 @@ The image accepts the following parameters, **as well as any of the [warc2zim](h
|
|||||||
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
|
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
|
||||||
- `--workers N` - number of crawl workers to be run in parallel
|
- `--workers N` - number of crawl workers to be run in parallel
|
||||||
- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
|
- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
|
||||||
- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
|
- `--keep` - in case of failure, WARC files and other temporary files (which are stored in a subfolder of the output directory) are always kept; otherwise, they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
|
||||||
|
|
||||||
Example command:
|
Example command:
|
||||||
|
|
||||||
|
@ -33,6 +33,8 @@ from zimit.constants import (
|
|||||||
)
|
)
|
||||||
from zimit.utils import download_file
|
from zimit.utils import download_file
|
||||||
|
|
||||||
|
temp_root_dir: Path | None = None
|
||||||
|
|
||||||
|
|
||||||
class ProgressFileWatcher:
|
class ProgressFileWatcher:
|
||||||
def __init__(self, output_dir: Path, stats_path: Path):
|
def __init__(self, output_dir: Path, stats_path: Path):
|
||||||
@ -103,6 +105,24 @@ class ProgressFileWatcher:
|
|||||||
json.dump(out, ofh)
|
json.dump(out, ofh)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup():
    """Remove the temporary root directory of the current crawl.

    Meant to be registered as an ``atexit`` handler. It reads the module-level
    ``temp_root_dir``; when that is still unset, nothing can be removed, so a
    warning is logged and the function returns without touching the filesystem.
    """
    if temp_root_dir:
        # Blank line + separator so the cleanup notice stands out in the logs.
        for banner_line in ("", "----------"):
            logger.info(banner_line)
        logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
        shutil.rmtree(temp_root_dir)
    else:
        logger.warning("Temporary root dir not already set, cannot clean this up")
|
||||||
|
|
||||||
|
|
||||||
|
def cancel_cleanup():
    """Keep the temporary files: report their location and disable removal.

    Logs where the temporary files are (the module-level ``temp_root_dir``) and
    unregisters ``cleanup`` from ``atexit`` so that they are not deleted when
    the interpreter exits.
    """
    kept_notice = (
        f"Temporary files have been kept in {temp_root_dir}, please clean them"
        " up manually once you don't need them anymore"
    )
    logger.info(kept_notice)
    # Unregistering is attempted unconditionally; callers also invoke this
    # function in situations where cleanup() may never have been registered.
    atexit.unregister(cleanup)
|
||||||
|
|
||||||
|
|
||||||
def run(raw_args):
|
def run(raw_args):
|
||||||
parser = ArgumentParser(
|
parser = ArgumentParser(
|
||||||
description="Run a browser-based crawl on the specified URL and convert to ZIM"
|
description="Run a browser-based crawl on the specified URL and convert to ZIM"
|
||||||
@ -314,7 +334,10 @@ def run(raw_args):
|
|||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--keep",
|
"--keep",
|
||||||
help="If set, keep WARC files after crawl, don't delete",
|
help="In case of failure, WARC files and other temporary files (which are "
|
||||||
|
"stored as a subfolder of output directory) are always kept, otherwise "
|
||||||
|
"they are automatically deleted. Use this flag to always keep WARC files, "
|
||||||
|
"even in case of success.",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -427,19 +450,13 @@ def run(raw_args):
|
|||||||
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
|
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
|
||||||
|
|
||||||
# make temp dir for this crawl
|
# make temp dir for this crawl
|
||||||
|
global temp_root_dir # noqa: PLW0603
|
||||||
if zimit_args.build:
|
if zimit_args.build:
|
||||||
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
|
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
|
||||||
else:
|
else:
|
||||||
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
|
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
|
||||||
|
|
||||||
if not zimit_args.keep:
|
if not zimit_args.keep:
|
||||||
|
|
||||||
def cleanup():
|
|
||||||
logger.info("")
|
|
||||||
logger.info("----------")
|
|
||||||
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
|
|
||||||
shutil.rmtree(temp_root_dir)
|
|
||||||
|
|
||||||
atexit.register(cleanup)
|
atexit.register(cleanup)
|
||||||
|
|
||||||
# copy / download custom behaviors to one single folder and configure crawler
|
# copy / download custom behaviors to one single folder and configure crawler
|
||||||
@ -599,6 +616,7 @@ def run(raw_args):
|
|||||||
logger.error(
|
logger.error(
|
||||||
f"Crawl returned an error: {crawl.returncode}, scraper exiting"
|
f"Crawl returned an error: {crawl.returncode}, scraper exiting"
|
||||||
)
|
)
|
||||||
|
cancel_cleanup()
|
||||||
return crawl.returncode
|
return crawl.returncode
|
||||||
|
|
||||||
if zimit_args.collection:
|
if zimit_args.collection:
|
||||||
@ -642,6 +660,11 @@ def run(raw_args):
|
|||||||
stats_content["partialZim"] = partial_zim
|
stats_content["partialZim"] = partial_zim
|
||||||
stats.write_text(json.dumps(stats_content))
|
stats.write_text(json.dumps(stats_content))
|
||||||
|
|
||||||
|
# also call cancel_cleanup when --keep, even if it is not supposed to be registered,
|
||||||
|
# so that we will display temporary files location just like in other situations
|
||||||
|
if warc2zim_exit_code or zimit_args.keep:
|
||||||
|
cancel_cleanup()
|
||||||
|
|
||||||
return warc2zim_exit_code
|
return warc2zim_exit_code
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user