Better processing of crawler exit codes with soft/hard limits

This commit is contained in:
benoit74 2025-02-11 16:57:19 +00:00
parent 3a7f583a96
commit 101fb71a0b
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
5 changed files with 149 additions and 45 deletions

View File

@ -57,13 +57,25 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: build image - name: build image
run: docker build -t zimit . run: docker build -t local-zimit .
- name: ensure help display without issue - name: ensure help display without issue
run: docker run -v $PWD/output:/output zimit zimit --help run: docker run -v $PWD/output:/output local-zimit zimit --help
- name: run crawl - name: run crawl with soft size limit
run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json
- name: run crawl with hard size limit
run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true
- name: run crawl with soft time limit
run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json
- name: run crawl with hard time limit
run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true
- name: run standard crawl
run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite - name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"

View File

@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
### Changed
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
### Fixed
- Do not create the ZIM when crawl is incomplete (#444)
## [2.1.8] - 2024-02-07 ## [2.1.8] - 2024-02-07
### Changed ### Changed

View File

@ -3,7 +3,8 @@ import logging
from zimscraperlib.logging import getLogger from zimscraperlib.logging import getLogger
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11 EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
NORMAL_WARC2ZIM_EXIT_CODE = 100 NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10 REQUESTS_TIMEOUT = 10

View File

@ -25,7 +25,8 @@ from zimscraperlib.uri import rebuild_uri
from zimit.__about__ import __version__ from zimit.__about__ import __version__
from zimit.constants import ( from zimit.constants import (
EXIT_CODE_CRAWLER_LIMIT_HIT, EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
EXIT_CODE_WARC2ZIM_CHECK_FAILED, EXIT_CODE_WARC2ZIM_CHECK_FAILED,
NORMAL_WARC2ZIM_EXIT_CODE, NORMAL_WARC2ZIM_EXIT_CODE,
logger, logger,
@ -61,35 +62,19 @@ class ProgressFileWatcher:
self.process.daemon = True self.process.daemon = True
self.process.start() self.process.start()
@staticmethod def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
ino = inotify.adapters.Inotify() ino = inotify.adapters.Inotify()
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
class Limit: def crawl_conv(data):
def __init__(self):
self.max = self.hit = None
@property
def as_dict(self):
return {"max": self.max, "hit": self.hit}
# limit is only reported by crawl but needs to be reported up
limit = Limit()
def crawl_conv(data, limit):
# we consider crawl to be 90% of the workload so total = craw_total * 90% # we consider crawl to be 90% of the workload so total = craw_total * 90%
# limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
limit.max = data["limit"]["max"]
limit.hit = data["limit"]["hit"]
return { return {
"done": data["crawled"], "done": data["crawled"],
"total": int(data["total"] / 0.9), "total": int(data["total"] / 0.9),
"limit": limit.as_dict,
} }
def warc2zim_conv(data, limit): def warc2zim_conv(data):
# we consider warc2zim to be 10% of the workload so # we consider warc2zim to be 10% of the workload so
# warc2zim_total = 10% and total = 90 + warc2zim_total * 10% # warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
return { return {
@ -98,7 +83,6 @@ class ProgressFileWatcher:
* (0.9 + (float(data["written"]) / data["total"]) / 10) * (0.9 + (float(data["written"]) / data["total"]) / 10)
), ),
"total": data["total"], "total": data["total"],
"limit": limit.as_dict,
} }
for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore
@ -108,7 +92,7 @@ class ProgressFileWatcher:
# open input and output separatly as to not clear output on error # open input and output separatly as to not clear output on error
with open(fpath) as ifh: with open(fpath) as ifh:
try: try:
out = func(json.load(ifh), limit) out = func(json.load(ifh))
except Exception: # nosec # noqa: S112 except Exception: # nosec # noqa: S112
# simply ignore progress update should an error arise # simply ignore progress update should an error arise
# might be malformed input for instance # might be malformed input for instance
@ -278,9 +262,17 @@ def run(raw_args):
"directory", "directory",
) )
parser.add_argument( size_group = parser.add_mutually_exclusive_group()
"--sizeLimit", size_group.add_argument(
help="If set, save state and exit if size limit exceeds this value", "--sizeSoftLimit",
help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
"ZIM will still be created.",
type=int,
)
size_group.add_argument(
"--sizeHardLimit",
help="If set, exit crawler and fail the scraper immediately if WARC size "
"exceeds this value",
type=int, type=int,
) )
@ -292,9 +284,17 @@ def run(raw_args):
default=90, default=90,
) )
parser.add_argument( time_group = parser.add_mutually_exclusive_group()
"--timeLimit", time_group.add_argument(
help="If set, save state and exit after time limit, in seconds", "--timeSoftLimit",
help="If set, save crawl state and stop crawl if WARC(s) creation takes "
"longer than this value, in seconds. ZIM will still be created.",
type=int,
)
time_group.add_argument(
"--timeHardLimit",
help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
" takes longer than this value, in seconds",
type=int, type=int,
) )
@ -369,6 +369,13 @@ def run(raw_args):
"path/URLs separated by comma", "path/URLs separated by comma",
) )
parser.add_argument(
"--acceptable-crawler-exit-codes",
help="Non-zero crawler exit codes to consider as acceptable to continue with "
"conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if "
"used). Single value with individual error codes separated by comma",
)
zimit_args, warc2zim_args = parser.parse_known_args(raw_args) zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
# pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are
@ -504,6 +511,8 @@ def run(raw_args):
f"{'will keep' if zimit_args.keep else 'will delete'}" f"{'will keep' if zimit_args.keep else 'will delete'}"
) )
partial_zim = False
# if warc files are passed, do not run browsertrix crawler but fetch the files if # if warc files are passed, do not run browsertrix crawler but fetch the files if
# they are provided as an HTTP URL + extract the archive if it is a tar.gz # they are provided as an HTTP URL + extract the archive if it is a tar.gz
warc_files: list[Path] = [] warc_files: list[Path] = []
@ -568,10 +577,29 @@ def run(raw_args):
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(cmd_args, check=False) crawl = subprocess.run(cmd_args, check=False)
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: if (
logger.info("crawl interupted by a limit") crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT
and zimit_args.sizeSoftLimit
):
logger.info(
"Crawl size soft limit hit. Continuing with warc2zim conversion."
)
if zimit_args.statsFilename:
partial_zim = True
elif (
crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT
and zimit_args.timeSoftLimit
):
logger.info(
"Crawl time soft limit hit. Continuing with warc2zim conversion."
)
if zimit_args.statsFilename:
partial_zim = True
elif crawl.returncode != 0: elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args) logger.error(
f"Crawl returned an error: {crawl.returncode}, scraper exiting"
)
return crawl.returncode
if zimit_args.collection: if zimit_args.collection:
warc_files = [ warc_files = [
@ -606,7 +634,15 @@ def run(raw_args):
logger.info(f"Calling warc2zim with these args: {warc2zim_args}") logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
return warc2zim(warc2zim_args) warc2zim_exit_code = warc2zim(warc2zim_args)
if zimit_args.statsFilename:
stats = Path(zimit_args.statsFilename)
stats_content = json.loads(stats.read_bytes())
stats_content["partialZim"] = partial_zim
stats.write_text(json.dumps(stats_content))
return warc2zim_exit_code
def get_cleaned_url(url: str): def get_cleaned_url(url: str):
@ -646,9 +682,11 @@ def get_node_cmd_line(args):
"behaviorTimeout", "behaviorTimeout",
"delay", "delay",
"profile", "profile",
"sizeLimit", "sizeSoftLimit",
"sizeHardLimit",
"diskUtilization", "diskUtilization",
"timeLimit", "timeSoftLimit",
"timeHardLimit",
"healthCheckPort", "healthCheckPort",
"overwrite", "overwrite",
"config", "config",
@ -668,7 +706,14 @@ def get_node_cmd_line(args):
continue continue
if value is None or (isinstance(value, bool) and value is False): if value is None or (isinstance(value, bool) and value is False):
continue continue
node_cmd.append("--" + arg) node_cmd.append(
"--"
+ (
"sizeLimit"
if arg in ["sizeSoftLimit", "sizeHardLimit"]
else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg
)
)
if not isinstance(value, bool): if not isinstance(value, bool):
node_cmd.append(str(value)) node_cmd.append(str(value))

View File

@ -3,13 +3,34 @@ import json
import os import os
from pathlib import Path from pathlib import Path
import pytest
from warcio import ArchiveIterator from warcio import ArchiveIterator
from zimscraperlib.zim import Archive from zimscraperlib.zim import Archive
def test_is_file(): @pytest.mark.parametrize(
"filename",
[
pytest.param("/output/tests_en_onepage.zim", id="onepage"),
pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
],
)
def test_zim_created(filename):
"""Ensure ZIM file exists""" """Ensure ZIM file exists"""
assert os.path.isfile("/output/tests_en_onepage.zim") assert os.path.isfile(filename)
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
        pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
    ],
)
def test_zim_not_created(filename):
    """Check that no ZIM file was produced for crawls that hit a hard limit."""
    # Hard size/time limits fail the scraper before warc2zim runs,
    # so the ZIM output must be absent.
    zim_present = os.path.exists(filename)
    assert not zim_present
def test_zim_main_page(): def test_zim_main_page():
@ -85,7 +106,7 @@ def test_user_agent():
assert found assert found
def test_stats_output(): def test_stats_output_standard():
assert json.loads(Path("/output/crawl.json").read_bytes()) == { assert json.loads(Path("/output/crawl.json").read_bytes()) == {
"crawled": 17, "crawled": 17,
"pending": 0, "pending": 0,
@ -103,5 +124,22 @@ def test_stats_output():
assert json.loads(Path("/output/stats.json").read_bytes()) == { assert json.loads(Path("/output/stats.json").read_bytes()) == {
"done": 8, "done": 8,
"total": 8, "total": 8,
"limit": {"max": 0, "hit": False}, "partialZim": False,
} }
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
        pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
    ],
)
def test_stats_output_softlimit(filename):
    """Ensure soft-limit stats files exist and flag the ZIM as partial."""
    file = Path(filename)
    # Bug fix: `file.exists` without parentheses is a bound method object and
    # is always truthy, so the original assertion could never fail. Call it.
    assert file.exists()
    content = json.loads(file.read_bytes())
    # Exact done/total counts vary per run; only the keys are stable.
    assert "done" in content
    assert "total" in content
    assert "partialZim" in content
    # A soft limit stops the crawl early, so the ZIM must be flagged partial.
    assert content["partialZim"]