From 0082d313aea40e83f9f07b600bd6dcd224d578a8 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 10 Nov 2020 09:12:34 +0000
Subject: [PATCH] Code formatting

- Added requests as a dependency (although currently brought in by warc2zim)
- removed unused imports
- black code formatting and some cleanup
- revamped actual_url fetching
---
 Dockerfile |  2 +-
 zimit.py   | 33 +++++++++++++++++++--------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f2a35f2..0a78e00 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,7 @@ RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN pip install 'warc2zim>=1.3.1'
+RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
 
 ADD zimit.py /app/
 
diff --git a/zimit.py b/zimit.py
index ccb591e..185299d 100755
--- a/zimit.py
+++ b/zimit.py
@@ -9,18 +9,16 @@ and then calls the Node based driver
 """
 
 from argparse import ArgumentParser
-import os
 import tempfile
 import subprocess
 import atexit
 import shutil
-import glob
 import signal
 import sys
 from pathlib import Path
+from urllib.parse import urlsplit
 
 from warc2zim.main import warc2zim
-from urllib.parse import urlsplit
 
 import requests
 
@@ -35,7 +33,7 @@ def zimit(args=None):
 
     parser.add_argument(
         "--newContext",
-        help="The context for each new capture, can be a new: page, session or browser.",
+        help="The context for each new capture (page, session or browser).",
         choices=["page", "session", "browser"],
         default="page",
     )
@@ -60,7 +58,8 @@ def zimit(args=None):
 
     parser.add_argument(
         "--scope",
-        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl "
+        "(defaults to the immediate directory of the URL)",
     )
 
     parser.add_argument(
@@ -110,10 +109,11 @@ def zimit(args=None):
     temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
 
     if not zimit_args.keep:
+
         def cleanup():
             print("")
             print("----------")
-            print("Cleanup, removing temp dir: " + str(temp_root_dir))
+            print(f"Cleanup, removing temp dir: {temp_root_dir}")
             shutil.rmtree(temp_root_dir)
 
         atexit.register(cleanup)
@@ -130,7 +130,7 @@ def zimit(args=None):
 
     print("")
     print("----------")
-    print("running browsertrix-crawler crawl: " + cmd_line)
+    print(f"running browsertrix-crawler crawl: {cmd_line}")
     subprocess.run(cmd_args, check=True)
 
     warc_files = temp_root_dir / "collections" / "capture" / "archive"
@@ -140,26 +140,31 @@ def zimit(args=None):
 
     print("")
     print("----------")
-    print("Processing {0} WARC files to ZIM".format(num_files))
+    print(f"Processing {num_files} WARC files to ZIM")
 
     return warc2zim(warc2zim_args)
 
+
 def check_url(url):
-    resp = requests.get(url, allow_redirects=True, timeout=3)
-    actual_url = resp.url
     try:
-        resp.close()
-    except Exception:
-        pass
+        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        print(f"failed to connect to {url}: {exc}")
+        raise SystemExit(1)
+    actual_url = resp.url
 
     if actual_url != url:
         if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
+            raise ValueError(
+                f"Main page URL ({url}) redirects to out-of-scope domain "
+                f"({actual_url}), cancelling crawl"
+            )
 
         return actual_url
 
     return url
 
+
 def get_node_cmd_line(args):
     node_cmd = ["crawl"]
     for arg in [