Merge pull request #346 from openzim/custom_behaviors

Add support for custom behaviors configuration
This commit is contained in:
benoit74 2024-08-07 11:31:57 +02:00 committed by GitHub
commit ea7653ef37
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 43 additions and 0 deletions

View File

@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
- Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
## [2.0.6] - 2024-08-02

View File

@ -7,6 +7,7 @@ and then calls the Node based driver
import atexit
import json
import logging
import re
import shutil
import signal
import subprocess
@ -19,6 +20,7 @@ from pathlib import Path
import inotify
import inotify.adapters
import requests
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
@ -28,6 +30,7 @@ from zimit.__about__ import __version__
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10
logger = getLogger(name="zimit", level=logging.INFO)
@ -354,6 +357,12 @@ def run(raw_args):
help="Crawler logging configuration",
)
parser.add_argument(
"--custom-behaviors",
help="JS code for custom behaviors to customize crawler. Single string with "
"individual JS files URL/path separated by a comma",
)
zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
logger.info("Checking browsertrix-crawler version")
@ -434,6 +443,38 @@ def run(raw_args):
atexit.register(cleanup)
# copy / download custom behaviors to one single folder and configure crawler
if zimit_args.custom_behaviors:
behaviors_dir = temp_root_dir / "custom-behaviors"
behaviors_dir.mkdir()
for custom_behavior in [
custom_behavior.strip()
for custom_behavior in zimit_args.custom_behaviors.split(",")
]:
behaviors_file = tempfile.NamedTemporaryFile(
dir=behaviors_dir,
prefix="behavior_",
suffix=".js",
delete_on_close=False,
)
if re.match(r"^https?\://", custom_behavior):
logger.info(
f"Downloading browser profile from {custom_behavior} "
f"to {behaviors_file.name}"
)
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
Path(behaviors_file.name).write_bytes(resp.content)
else:
logger.info(
f"Copying browser profile from {custom_behavior} "
f"to {behaviors_file.name}"
)
shutil.copy(custom_behavior, behaviors_file.name)
zimit_args.customBehaviors = str(behaviors_dir)
else:
zimit_args.customBehaviors = None
cmd_args = get_node_cmd_line(zimit_args)
if url:
cmd_args.append("--url")
@ -551,6 +592,7 @@ def get_node_cmd_line(args):
"overwrite",
"config",
"logging",
"customBehaviors",
]:
value = getattr(args, arg)
if arg == "userAgent":