diff --git a/Dockerfile b/Dockerfile
index f6c09a8..61b0a86 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN pip install 'warc2zim>=1.3.4' 'requests>=2.24.0' 'inotify==0.2.10'
+RUN pip install 'warc2zim>=1.3.4' 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
 
 ADD zimit.py /app/
 
diff --git a/zimit.py b/zimit.py
index 382b688..c5c7d36 100755
--- a/zimit.py
+++ b/zimit.py
@@ -8,6 +8,7 @@ This script validates arguments with warc2zim, checks permissions and then
 calls the Node based driver
 """
 
+import re
 from argparse import ArgumentParser
 import tempfile
 import subprocess
@@ -17,13 +18,13 @@ import signal
 import sys
 import json
 from pathlib import Path
-from urllib.parse import urlsplit
 from multiprocessing import Process
 
 from warc2zim.main import warc2zim
 import requests
 import inotify
 import inotify.adapters
+from tld import get_fld
 
 
 class ProgressFileWatcher:
@@ -184,7 +185,7 @@ def zimit(args=None):
 
     url = zimit_args.url
     if url:
-        url = check_url(url)
+        url = check_url(url, zimit_args.scope)
         warc2zim_args.append("--url")
         warc2zim_args.append(url)
 
@@ -264,7 +265,7 @@ def zimit(args=None):
     return warc2zim(warc2zim_args)
 
 
-def check_url(url):
+def check_url(url, scope=None):
     try:
         resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
     except requests.exceptions.RequestException as exc:
@@ -273,13 +274,22 @@ def check_url(url):
     actual_url = resp.url
 
     if actual_url != url:
-        if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError(
-                f"Main page URL ({url}) redirects to out-of-scope domain "
-                f"({actual_url}), cancelling crawl"
-            )
+        # redirect on same domain or same first-level domain
+        if get_fld(url) == get_fld(actual_url):
+            return actual_url
 
-        return actual_url
+        # is it in scope?
+        if scope:
+            try:
+                if re.match(scope, actual_url):
+                    return actual_url
+            except Exception as exc:
+                print(f"failed to parse your scope regexp for url checking: {exc}")
+
+        raise ValueError(
+            f"Main page URL ({url}) redirects to out-of-scope domain "
+            f"({actual_url}), cancelling crawl"
+        )
 
     return url
 
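Illustrative sketch (not part of the patch): how the reworked redirect check in check_url()
behaves, assuming the 'tld' package is installed. The helper name redirect_in_scope() and the
example URLs are hypothetical, introduced only to show the first-level-domain comparison and
the optional scope-regexp fallback.

    import re
    from tld import get_fld

    def redirect_in_scope(url, actual_url, scope=None):
        # same first-level domain, e.g. https://example.com -> https://www.example.com/
        if get_fld(url) == get_fld(actual_url):
            return True
        # otherwise fall back to the optional user-supplied scope regexp
        if scope:
            try:
                return re.match(scope, actual_url) is not None
            except re.error as exc:
                print(f"failed to parse your scope regexp for url checking: {exc}")
        return False

    print(redirect_in_scope("https://example.com/", "https://www.example.com/home"))  # True
    print(redirect_in_scope("https://example.com/", "https://other.org/", scope=r"https://other\.org/"))  # True
    print(redirect_in_scope("https://example.com/", "https://other.org/"))  # False

With this change, a redirect from the bare domain to its www. subdomain (or any sibling
subdomain) no longer cancels the crawl, while truly out-of-scope redirects still raise
ValueError unless they match the user-supplied --scope regexp.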