Mirror of https://github.com/openzim/zimit.git
Code formatting
- Added requests as a dependency (although currently brought in by warc2zim)
- Removed unused imports
- Applied black code formatting and some cleanup
- Revamped actual_url fetching
Parent: 568068ecfc
Commit: 0082d313ae
Dockerfile
@@ -4,7 +4,7 @@ RUN mkdir -p /output

 WORKDIR /app

-RUN pip install 'warc2zim>=1.3.1'
+RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'

 ADD zimit.py /app/

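The Dockerfile now pins requests explicitly instead of relying on warc2zim to pull it in transitively. A minimal sketch (not part of the commit; assumes Python 3.8+ for importlib.metadata and plain dotted release strings) of checking that both pinned packages meet their minimum versions inside the image:

from importlib.metadata import version


def parse(ver: str) -> tuple:
    """Turn a plain dotted release like '2.24.0' into a comparable tuple."""
    return tuple(int(part) for part in ver.split(".")[:3])


# Fail loudly if either pinned dependency is missing or too old.
for pkg, minimum in (("warc2zim", "1.3.1"), ("requests", "2.24.0")):
    assert parse(version(pkg)) >= parse(minimum), f"{pkg} is older than {minimum}"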
zimit.py (33 lines changed)
@@ -9,18 +9,16 @@ and then calls the Node based driver
 """

 from argparse import ArgumentParser
-import os
 import tempfile
 import subprocess
 import atexit
 import shutil
-import glob
 import signal
 import sys
 from pathlib import Path
+from urllib.parse import urlsplit

 from warc2zim.main import warc2zim
-from urllib.parse import urlsplit
 import requests


@@ -35,7 +33,7 @@ def zimit(args=None):

     parser.add_argument(
         "--newContext",
-        help="The context for each new capture, can be a new: page, session or browser.",
+        help="The context for each new capture (page, session or browser).",
         choices=["page", "session", "browser"],
         default="page",
     )
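The reworded --newContext help keeps the same choices/default behaviour. A standalone sketch (not the full zimit parser) of how argparse enforces them:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    "--newContext",
    help="The context for each new capture (page, session or browser).",
    choices=["page", "session", "browser"],
    default="page",
)

print(parser.parse_args([]).newContext)                            # "page"
print(parser.parse_args(["--newContext", "session"]).newContext)   # "session"
# parser.parse_args(["--newContext", "tab"]) would exit with an error,
# since "tab" is not one of the allowed choices.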
@@ -60,7 +58,8 @@ def zimit(args=None):

     parser.add_argument(
         "--scope",
-        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl "
+        "(defaults to the immediate directory of the URL)",
     )

     parser.add_argument(
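The --scope help string is now split across two adjacent literals; Python joins them at compile time, so the rendered help text is unchanged. A tiny illustration:

# Adjacent string literals concatenate into one string with no newline.
help_text = (
    "Regex of page URLs that should be included in the crawl "
    "(defaults to the immediate directory of the URL)"
)
assert "crawl (defaults" in help_text
print(help_text)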
@@ -110,10 +109,11 @@ def zimit(args=None):
     temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))

     if not zimit_args.keep:
+
         def cleanup():
             print("")
             print("----------")
-            print("Cleanup, removing temp dir: " + str(temp_root_dir))
+            print(f"Cleanup, removing temp dir: {temp_root_dir}")
             shutil.rmtree(temp_root_dir)

         atexit.register(cleanup)
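Aside from the f-string, the cleanup logic is unchanged: the temporary working directory is removed when the interpreter exits. A standalone sketch of that pattern (ignore_errors is an addition in this sketch, not in zimit):

import atexit
import shutil
import tempfile
from pathlib import Path

# Create a temp working dir and register its removal for interpreter exit.
temp_root_dir = Path(tempfile.mkdtemp(prefix=".tmp"))


def cleanup():
    print(f"Cleanup, removing temp dir: {temp_root_dir}")
    shutil.rmtree(temp_root_dir, ignore_errors=True)  # ignore_errors: sketch only


atexit.register(cleanup)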
@@ -130,7 +130,7 @@ def zimit(args=None):

     print("")
     print("----------")
-    print("running browsertrix-crawler crawl: " + cmd_line)
+    print(f"running browsertrix-crawler crawl: {cmd_line}")
     subprocess.run(cmd_args, check=True)

     warc_files = temp_root_dir / "collections" / "capture" / "archive"
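The crawl invocation itself is unchanged: zimit echoes the command line, then runs browsertrix-crawler's crawl entrypoint with check=True so a failed crawl aborts the run. A sketch of that pattern (the argument list here is an illustrative assumption; zimit builds the real one in get_node_cmd_line):

import subprocess

# Illustrative arguments only.
cmd_args = ["crawl", "--url", "https://example.com/"]
cmd_line = " ".join(cmd_args)

print(f"running browsertrix-crawler crawl: {cmd_line}")
# check=True raises subprocess.CalledProcessError if crawl exits non-zero.
subprocess.run(cmd_args, check=True)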
@@ -140,26 +140,31 @@ def zimit(args=None):

     print("")
     print("----------")
-    print("Processing {0} WARC files to ZIM".format(num_files))
+    print(f"Processing {num_files} WARC files to ZIM")

     return warc2zim(warc2zim_args)

+
 def check_url(url):
-    resp = requests.get(url, allow_redirects=True, timeout=3)
-    actual_url = resp.url
     try:
-        resp.close()
-    except Exception:
-        pass
+        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        print(f"failed to connect to {url}: {exc}")
+        raise SystemExit(1)
+    actual_url = resp.url

     if actual_url != url:
         if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
+            raise ValueError(
+                f"Main page URL ({url}) redirects to out-of-scope domain "
+                f"({actual_url}), cancelling crawl"
+            )

         return actual_url

     return url

+
 def get_node_cmd_line(args):
     node_cmd = ["crawl"]
     for arg in [
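For reference, the revamped check_url as a standalone, runnable function (same logic as the diff above): a single HEAD request with a 10-second timeout replaces GET plus a manual close, connection failures abort the run instead of being silently ignored, and a redirect to a different host is rejected before the crawl starts.

from urllib.parse import urlsplit

import requests


def check_url(url):
    try:
        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
    except requests.exceptions.RequestException as exc:
        print(f"failed to connect to {url}: {exc}")
        raise SystemExit(1)
    actual_url = resp.url

    if actual_url != url:
        # A redirect within the same host is fine; just crawl the final URL.
        if urlsplit(url).netloc != urlsplit(actual_url).netloc:
            raise ValueError(
                f"Main page URL ({url}) redirects to out-of-scope domain "
                f"({actual_url}), cancelling crawl"
            )
        return actual_url

    return url


# Example usage: returns the final URL after following any same-host redirect.
print(check_url("http://example.com/"))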