Merge pull request #346 from openzim/custom_behaviors

Add support for custom behaviors configuration
This commit is contained in:
benoit74 2024-08-07 11:31:57 +02:00 committed by GitHub
commit ea7653ef37
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 43 additions and 0 deletions

View File

@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
- Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
## [2.0.6] - 2024-08-02

View File

@ -7,6 +7,7 @@ and then calls the Node based driver
import atexit import atexit
import json import json
import logging import logging
import re
import shutil import shutil
import signal import signal
import subprocess import subprocess
@ -19,6 +20,7 @@ from pathlib import Path
import inotify import inotify
import inotify.adapters import inotify.adapters
import requests
from warc2zim.main import main as warc2zim from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri from zimscraperlib.uri import rebuild_uri
@ -28,6 +30,7 @@ from zimit.__about__ import __version__
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11 EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100 NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10
logger = getLogger(name="zimit", level=logging.INFO) logger = getLogger(name="zimit", level=logging.INFO)
@ -354,6 +357,12 @@ def run(raw_args):
help="Crawler logging configuration", help="Crawler logging configuration",
) )
parser.add_argument(
"--custom-behaviors",
help="JS code for custom behaviors to customize crawler. Single string with "
"individual JS files URL/path separated by a comma",
)
zimit_args, warc2zim_args = parser.parse_known_args(raw_args) zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
logger.info("Checking browsertrix-crawler version") logger.info("Checking browsertrix-crawler version")
@ -434,6 +443,38 @@ def run(raw_args):
atexit.register(cleanup) atexit.register(cleanup)
# copy / download custom behaviors to one single folder and configure crawler
if zimit_args.custom_behaviors:
behaviors_dir = temp_root_dir / "custom-behaviors"
behaviors_dir.mkdir()
for custom_behavior in [
custom_behavior.strip()
for custom_behavior in zimit_args.custom_behaviors.split(",")
]:
behaviors_file = tempfile.NamedTemporaryFile(
dir=behaviors_dir,
prefix="behavior_",
suffix=".js",
delete_on_close=False,
)
if re.match(r"^https?\://", custom_behavior):
logger.info(
f"Downloading browser profile from {custom_behavior} "
f"to {behaviors_file.name}"
)
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
Path(behaviors_file.name).write_bytes(resp.content)
else:
logger.info(
f"Copying browser profile from {custom_behavior} "
f"to {behaviors_file.name}"
)
shutil.copy(custom_behavior, behaviors_file.name)
zimit_args.customBehaviors = str(behaviors_dir)
else:
zimit_args.customBehaviors = None
cmd_args = get_node_cmd_line(zimit_args) cmd_args = get_node_cmd_line(zimit_args)
if url: if url:
cmd_args.append("--url") cmd_args.append("--url")
@ -551,6 +592,7 @@ def get_node_cmd_line(args):
"overwrite", "overwrite",
"config", "config",
"logging", "logging",
"customBehaviors",
]: ]:
value = getattr(args, arg) value = getattr(args, arg)
if arg == "userAgent": if arg == "userAgent":