Mirror of https://github.com/openzim/zimit.git

Better processing of crawler exit codes with soft/hard limits

This commit is contained in:
parent 3a7f583a96
commit 101fb71a0b

.github/workflows/Tests.yaml (vendored, 22 changes)
@@ -57,13 +57,25 @@ jobs:
         uses: actions/checkout@v4

       - name: build image
-        run: docker build -t zimit .
+        run: docker build -t local-zimit .

       - name: ensure help display without issue
-        run: docker run -v $PWD/output:/output zimit zimit --help
+        run: docker run -v $PWD/output:/output local-zimit zimit --help

-      - name: run crawl
-        run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
+      - name: run crawl with soft size limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json
+
+      - name: run crawl with hard size limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true
+
+      - name: run crawl with soft time limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json
+
+      - name: run crawl with hard time limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true
+
+      - name: run standard crawl
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

       - name: run integration test suite
-        run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
+        run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
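Note: the workflow now exercises each limit flavour. Soft-limit runs are expected to exit zero and still produce a ZIM, while hard-limit runs are expected to make the scraper fail, hence the "|| true" that keeps the job going so the integration tests can later assert that no ZIM was written. Below is a rough Python equivalent of one hard-limit step (illustrative only; it assumes the local-zimit image has been built and an ./output directory exists):

    # Illustration of the hard-limit expectation encoded by "|| true" above.
    import pathlib
    import subprocess

    output = pathlib.Path.cwd() / "output"
    result = subprocess.run(
        [
            "docker", "run", "-v", f"{output}:/output", "local-zimit", "zimit",
            "--url", "http://website.test.openzim.org/",
            "--sizeHardLimit", "8192",
            "--name", "tests_en_sizehardlimit",
            "--zim-file", "tests_en_sizehardlimit.zim",
            "--adminEmail", "test@example.com",
        ],
        check=False,  # same effect as "|| true": a non-zero exit is expected here
    )
    print("scraper exit code:", result.returncode)
    # The integration suite later asserts the ZIM was NOT created:
    assert not (output / "tests_en_sizehardlimit.zim").exists()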
CHANGELOG.md

@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
+
+### Fixed
+
+- Do not create the ZIM when crawl is incomplete (#444)
+
 ## [2.1.8] - 2024-02-07

 ### Changed
src/zimit/constants.py

@@ -3,7 +3,8 @@ import logging
 from zimscraperlib.logging import getLogger

 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
-EXIT_CODE_CRAWLER_LIMIT_HIT = 11
+EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
+EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
 NORMAL_WARC2ZIM_EXIT_CODE = 100
 REQUESTS_TIMEOUT = 10

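Note: the generic EXIT_CODE_CRAWLER_LIMIT_HIT (11) is replaced by two distinct codes so zimit can tell which limit interrupted Browsertrix Crawler. A tiny, illustrative mapping, not part of the commit (the real handling lives in run(), further down):

    # Illustrative only: one way to describe the new crawler exit codes.
    EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
    EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15

    LIMIT_MESSAGES = {
        EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT: "crawl stopped by the WARC size limit",
        EXIT_CODE_CRAWLER_TIME_LIMIT_HIT: "crawl stopped by the time limit",
    }

    def describe_crawler_exit(returncode: int) -> str:
        if returncode == 0:
            return "crawl completed normally"
        return LIMIT_MESSAGES.get(returncode, f"crawl failed with exit code {returncode}")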
src/zimit/zimit.py

@@ -25,7 +25,8 @@ from zimscraperlib.uri import rebuild_uri

 from zimit.__about__ import __version__
 from zimit.constants import (
-    EXIT_CODE_CRAWLER_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
     EXIT_CODE_WARC2ZIM_CHECK_FAILED,
     NORMAL_WARC2ZIM_EXIT_CODE,
     logger,
@@ -61,35 +62,19 @@ class ProgressFileWatcher:
         self.process.daemon = True
         self.process.start()

-    @staticmethod
-    def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
+    def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
         ino = inotify.adapters.Inotify()
         ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
         ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore

-        class Limit:
-            def __init__(self):
-                self.max = self.hit = None
-
-            @property
-            def as_dict(self):
-                return {"max": self.max, "hit": self.hit}
-
-        # limit is only reported by crawl but needs to be reported up
-        limit = Limit()
-
-        def crawl_conv(data, limit):
+        def crawl_conv(data):
             # we consider crawl to be 90% of the workload so total = craw_total * 90%
-            # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
-            limit.max = data["limit"]["max"]
-            limit.hit = data["limit"]["hit"]
             return {
                 "done": data["crawled"],
                 "total": int(data["total"] / 0.9),
-                "limit": limit.as_dict,
             }

-        def warc2zim_conv(data, limit):
+        def warc2zim_conv(data):
             # we consider warc2zim to be 10% of the workload so
             # warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
             return {
@@ -98,7 +83,6 @@ class ProgressFileWatcher:
                     * (0.9 + (float(data["written"]) / data["total"]) / 10)
                 ),
                 "total": data["total"],
-                "limit": limit.as_dict,
             }

         for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
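Note: the progress arithmetic itself is unchanged by this commit: the crawl is counted as 90% of the work and warc2zim as the remaining 10%, so the overall total is crawl_total / 0.9 and warc2zim progress is scaled into the last tenth. A worked example with made-up stats values (only the formulas come from the code above):

    # Worked example of the 90/10 progress split in crawl_conv / warc2zim_conv.
    crawl_stats = {"crawled": 18, "total": 36}          # made-up crawl.json values
    crawl_progress = {
        "done": crawl_stats["crawled"],
        "total": int(crawl_stats["total"] / 0.9),        # 36 / 0.9 -> 40 overall units
    }

    warc2zim_stats = {"written": 4, "total": 8}          # made-up warc2zim.json values
    warc2zim_progress = {
        # 0.9 of the work is credited to the crawl, the rest follows written/total
        "done": int(
            warc2zim_stats["total"]
            * (0.9 + (float(warc2zim_stats["written"]) / warc2zim_stats["total"]) / 10)
        ),
        "total": warc2zim_stats["total"],
    }

    print(crawl_progress)     # {'done': 18, 'total': 40}
    print(warc2zim_progress)  # {'done': 7, 'total': 8}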
@@ -108,7 +92,7 @@ class ProgressFileWatcher:
             # open input and output separatly as to not clear output on error
             with open(fpath) as ifh:
                 try:
-                    out = func(json.load(ifh), limit)
+                    out = func(json.load(ifh))
                 except Exception:  # nosec # noqa: S112
                     # simply ignore progress update should an error arise
                     # might be malformed input for instance
@@ -278,9 +262,17 @@ def run(raw_args):
         "directory",
     )

-    parser.add_argument(
-        "--sizeLimit",
-        help="If set, save state and exit if size limit exceeds this value",
+    size_group = parser.add_mutually_exclusive_group()
+    size_group.add_argument(
+        "--sizeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
+        "ZIM will still be created.",
+        type=int,
+    )
+    size_group.add_argument(
+        "--sizeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC size "
+        "exceeds this value",
         type=int,
     )

@@ -292,9 +284,17 @@ def run(raw_args):
         default=90,
     )

-    parser.add_argument(
-        "--timeLimit",
-        help="If set, save state and exit after time limit, in seconds",
+    time_group = parser.add_mutually_exclusive_group()
+    time_group.add_argument(
+        "--timeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC WARC(s) creation takes "
+        "longer than this value, in seconds. ZIM will still be created.",
+        type=int,
+    )
+    time_group.add_argument(
+        "--timeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
+        " takes longer than this value, in seconds",
         type=int,
     )

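Note: --sizeLimit and --timeLimit are each split into a soft and a hard variant, and each pair is registered in an argparse mutually exclusive group so callers cannot request both behaviours at once. A minimal, self-contained sketch of that pattern (flag names from the diff, everything else illustrative):

    # Minimal sketch of the mutually-exclusive-group pattern used above.
    import argparse

    parser = argparse.ArgumentParser(prog="zimit-sketch")
    size_group = parser.add_mutually_exclusive_group()
    size_group.add_argument("--sizeSoftLimit", type=int)
    size_group.add_argument("--sizeHardLimit", type=int)

    print(parser.parse_args(["--sizeSoftLimit", "8192"]))
    # Namespace(sizeSoftLimit=8192, sizeHardLimit=None)

    # Passing both flags makes argparse error out, e.g.:
    #   error: argument --sizeHardLimit: not allowed with argument --sizeSoftLimit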
@@ -369,6 +369,13 @@ def run(raw_args):
         "path/URLs separated by comma",
     )

+    parser.add_argument(
+        "--acceptable-crawler-exit-codes",
+        help="Non-zero crawler exit codes to consider as acceptable to continue with "
+        " conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if "
+        " used). Single value with individual error codes separated by comma",
+    )
+
     zimit_args, warc2zim_args = parser.parse_known_args(raw_args)

     # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are
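Note: --acceptable-crawler-exit-codes takes a single comma-separated value; this hunk only adds the CLI flag, so how it is parsed and applied is not shown here. The helper below is a hypothetical illustration of that kind of normalisation, not code from the commit:

    # Hypothetical helper (NOT from the commit): normalise the comma-separated flag.
    def parse_acceptable_exit_codes(raw: str | None) -> set[int]:
        if not raw:
            return set()
        return {int(code) for code in (part.strip() for part in raw.split(",")) if code}

    assert parse_acceptable_exit_codes("10,11") == {10, 11}
    assert parse_acceptable_exit_codes(" 10 , 11 ") == {10, 11}
    assert parse_acceptable_exit_codes(None) == set()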
@@ -504,6 +511,8 @@ def run(raw_args):
         f"{'will keep' if zimit_args.keep else 'will delete'}"
     )

+    partial_zim = False
+
     # if warc files are passed, do not run browsertrix crawler but fetch the files if
     # they are provided as an HTTP URL + extract the archive if it is a tar.gz
     warc_files: list[Path] = []
@@ -568,10 +577,29 @@ def run(raw_args):

     logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
     crawl = subprocess.run(cmd_args, check=False)
-    if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
-        logger.info("crawl interupted by a limit")
+    if (
+        crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT
+        and zimit_args.sizeSoftLimit
+    ):
+        logger.info(
+            "Crawl size soft limit hit. Continuing with warc2zim conversion."
+        )
+        if zimit_args.statsFilename:
+            partial_zim = True
+    elif (
+        crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT
+        and zimit_args.timeSoftLimit
+    ):
+        logger.info(
+            "Crawl time soft limit hit. Continuing with warc2zim conversion."
+        )
+        if zimit_args.statsFilename:
+            partial_zim = True
     elif crawl.returncode != 0:
-        raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
+        logger.error(
+            f"Crawl returned an error: {crawl.returncode}, scraper exiting"
+        )
+        return crawl.returncode

     if zimit_args.collection:
         warc_files = [
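Note: this branch reads as a small decision table: the size-limit exit code with --sizeSoftLimit set, or the time-limit exit code with --timeSoftLimit set, means "stop crawling but still convert the WARCs and flag the ZIM as partial"; any other non-zero code aborts the scraper before warc2zim runs, so hard limits produce no ZIM. Paraphrased below as a standalone helper for readability (illustrative; in the real code the logic is inline in run() and the partial flag is only recorded when --statsFilename is given):

    # Illustrative paraphrase of the exit-code handling above.
    EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
    EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15

    def crawl_outcome(returncode, size_soft_limit, time_soft_limit):
        """Return ("continue", partial_zim) or ("abort", returncode)."""
        if returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT and size_soft_limit:
            return ("continue", True)     # soft size limit: build a partial ZIM
        if returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT and time_soft_limit:
            return ("continue", True)     # soft time limit: build a partial ZIM
        if returncode != 0:
            return ("abort", returncode)  # hard limit or crawler failure: no ZIM
        return ("continue", False)        # normal completion

    assert crawl_outcome(0, None, None) == ("continue", False)
    assert crawl_outcome(14, 8192, None) == ("continue", True)
    assert crawl_outcome(15, None, 1) == ("continue", True)
    assert crawl_outcome(15, None, None) == ("abort", 15)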
@@ -606,7 +634,15 @@ def run(raw_args):

     logger.info(f"Calling warc2zim with these args: {warc2zim_args}")

-    return warc2zim(warc2zim_args)
+    warc2zim_exit_code = warc2zim(warc2zim_args)
+
+    if zimit_args.statsFilename:
+        stats = Path(zimit_args.statsFilename)
+        stats_content = json.loads(stats.read_bytes())
+        stats_content["partialZim"] = partial_zim
+        stats.write_text(json.dumps(stats_content))
+
+    return warc2zim_exit_code


 def get_cleaned_url(url: str):
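Note: after warc2zim returns, the partialZim flag is merged into the existing --statsFilename JSON rather than written to a separate file, so the Zimfarm (or any other client) keeps reading a single document. A sketch of the consumer side (the file name matches the CI job above; the reading code itself is an assumption, not part of the commit):

    # Sketch of how a caller might consume the flag written above (illustrative).
    import json
    from pathlib import Path

    stats = json.loads(Path("/output/stats.json").read_bytes())
    if stats.get("partialZim"):
        print(f"partial ZIM: progress stopped at {stats['done']}/{stats['total']}")
    else:
        print("ZIM covers the complete crawl")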
@@ -646,9 +682,11 @@ def get_node_cmd_line(args):
         "behaviorTimeout",
         "delay",
         "profile",
-        "sizeLimit",
+        "sizeSoftLimit",
+        "sizeHardLimit",
         "diskUtilization",
-        "timeLimit",
+        "timeSoftLimit",
+        "timeHardLimit",
         "healthCheckPort",
         "overwrite",
         "config",
@@ -668,7 +706,14 @@ def get_node_cmd_line(args):
             continue
         if value is None or (isinstance(value, bool) and value is False):
            continue
-        node_cmd.append("--" + arg)
+        node_cmd.append(
+            "--"
+            + (
+                "sizeLimit"
+                if arg in ["sizeSoftLimit", "sizeHardLimit"]
+                else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg
+            )
+        )
         if not isinstance(value, bool):
             node_cmd.append(str(value))

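Note: Browsertrix Crawler is still invoked with its own --sizeLimit / --timeLimit options, so the four new zimit flags are folded back onto those two names when the node command line is assembled. A worked example of the expression above, pulled out into a function (illustrative):

    # Worked example of the flag-name folding above (illustrative).
    def crawler_flag(arg: str) -> str:
        return "--" + (
            "sizeLimit"
            if arg in ["sizeSoftLimit", "sizeHardLimit"]
            else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg
        )

    assert crawler_flag("sizeSoftLimit") == "--sizeLimit"
    assert crawler_flag("sizeHardLimit") == "--sizeLimit"
    assert crawler_flag("timeSoftLimit") == "--timeLimit"
    assert crawler_flag("timeHardLimit") == "--timeLimit"
    assert crawler_flag("behaviorTimeout") == "--behaviorTimeout"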
tests-integration/integration.py

@@ -3,13 +3,34 @@ import json
 import os
 from pathlib import Path

+import pytest
 from warcio import ArchiveIterator
 from zimscraperlib.zim import Archive


-def test_is_file():
+@pytest.mark.parametrize(
+    "filename",
+    [
+        pytest.param("/output/tests_en_onepage.zim", id="onepage"),
+        pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
+        pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
+    ],
+)
+def test_zim_created(filename):
     """Ensure ZIM file exists"""
-    assert os.path.isfile("/output/tests_en_onepage.zim")
+    assert os.path.isfile(filename)
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
+        pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
+    ],
+)
+def test_zim_not_created(filename):
+    """Ensure ZIM file does not exists"""
+    assert not os.path.exists(filename)


 def test_zim_main_page():
@@ -85,7 +106,7 @@ def test_user_agent():
     assert found


-def test_stats_output():
+def test_stats_output_standard():
     assert json.loads(Path("/output/crawl.json").read_bytes()) == {
         "crawled": 17,
         "pending": 0,
@@ -103,5 +124,22 @@ def test_stats_output():
     assert json.loads(Path("/output/stats.json").read_bytes()) == {
         "done": 8,
         "total": 8,
-        "limit": {"max": 0, "hit": False},
+        "partialZim": False,
     }
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
+        pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
+    ],
+)
+def test_stats_output_softlimit(filename):
+    file = Path(filename)
+    assert file.exists
+    content = json.loads(file.read_bytes())
+    assert "done" in content
+    assert "total" in content
+    assert "partialZim" in content
+    assert content["partialZim"]
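Note: the soft-limit runs only check the shape of the stats file and that partialZim is truthy; the exact numbers depend on where the crawl happened to be interrupted, so they are not asserted. A made-up payload that would satisfy test_stats_output_softlimit:

    # Made-up example of a stats file accepted by test_stats_output_softlimit.
    import json

    content = json.loads('{"done": 3, "total": 12, "partialZim": true}')
    assert "done" in content
    assert "total" in content
    assert "partialZim" in content
    assert content["partialZim"]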