Mirror of https://github.com/openzim/zimit.git, synced 2025-09-23 03:52:16 -04:00
Code formatting

- Added requests as a dependency (although currently brought in by warc2zim)
- removed unused imports
- black code formatting and some cleanup
- revamped actual_url fetching
parent 568068ecfc
commit 0082d313ae
Dockerfile
@@ -4,7 +4,7 @@ RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN pip install 'warc2zim>=1.3.1'
+RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
 
 ADD zimit.py /app/
 
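The '>=2.24.0' pin only guarantees a minimum version at image build time. A minimal runtime sanity check, as a sketch assuming a standard Python 3 environment (not part of this commit):

    # hypothetical check that the installed requests satisfies the
    # '>=2.24.0' pin from the Dockerfile above
    import requests

    installed = tuple(int(p) for p in requests.__version__.split(".")[:3])
    assert installed >= (2, 24, 0), f"requests too old: {requests.__version__}"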
zimit.py (33 changes)
@@ -9,18 +9,16 @@ and then calls the Node based driver
 """
 
 from argparse import ArgumentParser
 import os
 import tempfile
 import subprocess
 import atexit
 import shutil
-import glob
 import signal
 import sys
 from pathlib import Path
+from urllib.parse import urlsplit
 
 from warc2zim.main import warc2zim
-from urllib.parse import urlsplit
-
 import requests
 
@@ -35,7 +33,7 @@ def zimit(args=None):
 
     parser.add_argument(
         "--newContext",
-        help="The context for each new capture, can be a new: page, session or browser.",
+        help="The context for each new capture (page, session or browser).",
         choices=["page", "session", "browser"],
         default="page",
     )
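With choices and a default, argparse now rejects any value outside the three contexts instead of passing it through to the crawler. A minimal standalone sketch of the behaviour:

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--newContext",
        help="The context for each new capture (page, session or browser).",
        choices=["page", "session", "browser"],
        default="page",
    )

    print(parser.parse_args([]).newContext)  # page (the default)
    print(parser.parse_args(["--newContext", "session"]).newContext)  # session
    # parser.parse_args(["--newContext", "tab"]) exits with
    # "invalid choice: 'tab' (choose from 'page', 'session', 'browser')"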
@@ -60,7 +58,8 @@ def zimit(args=None):
 
     parser.add_argument(
         "--scope",
-        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl "
+        "(defaults to the immediate directory of the URL)",
     )
 
     parser.add_argument(
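The reflowed help= value relies on Python's compile-time concatenation of adjacent string literals, so the message stays byte-for-byte identical to the original single line. A quick demonstration:

    # adjacent literals are joined at compile time; note the trailing
    # space kept inside the first fragment
    help_text = (
        "Regex of page URLs that should be included in the crawl "
        "(defaults to the immediate directory of the URL)"
    )
    assert "crawl (defaults" in help_text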
@@ -110,10 +109,11 @@ def zimit(args=None):
     temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
 
     if not zimit_args.keep:
+
         def cleanup():
             print("")
             print("----------")
-            print("Cleanup, removing temp dir: " + str(temp_root_dir))
+            print(f"Cleanup, removing temp dir: {temp_root_dir}")
             shutil.rmtree(temp_root_dir)
 
         atexit.register(cleanup)
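The cleanup handler is only registered when --keep is absent, so the temp dir survives for debugging on request. The pattern in isolation, as a sketch (keep is a stand-in for zimit_args.keep, and mkdtemp here uses the system temp dir rather than the output dir):

    import atexit
    import shutil
    import tempfile
    from pathlib import Path

    keep = False  # stand-in for zimit_args.keep
    temp_root_dir = Path(tempfile.mkdtemp(prefix=".tmp"))

    if not keep:

        def cleanup():
            # runs once when the interpreter exits normally
            # (including via sys.exit / SystemExit)
            shutil.rmtree(temp_root_dir)

        atexit.register(cleanup)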
@@ -130,7 +130,7 @@ def zimit(args=None):
 
     print("")
     print("----------")
-    print("running browsertrix-crawler crawl: " + cmd_line)
+    print(f"running browsertrix-crawler crawl: {cmd_line}")
     subprocess.run(cmd_args, check=True)
 
     warc_files = temp_root_dir / "collections" / "capture" / "archive"
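Because the crawl runs with check=True, a non-zero exit from browsertrix-crawler raises CalledProcessError and aborts zimit before any WARC-to-ZIM conversion starts. A tiny illustration, using the POSIX false command as a stand-in for the crawl command:

    import subprocess

    try:
        subprocess.run(["false"], check=True)  # stand-in for cmd_args
    except subprocess.CalledProcessError as exc:
        print(f"crawl failed with exit code {exc.returncode}")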
@@ -140,26 +140,31 @@ def zimit(args=None):
 
     print("")
     print("----------")
-    print("Processing {0} WARC files to ZIM".format(num_files))
+    print(f"Processing {num_files} WARC files to ZIM")
 
     return warc2zim(warc2zim_args)
 
 
 def check_url(url):
-    resp = requests.get(url, allow_redirects=True, timeout=3)
-    actual_url = resp.url
     try:
-        resp.close()
-    except Exception:
-        pass
+        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        print(f"failed to connect to {url}: {exc}")
+        raise SystemExit(1)
+    actual_url = resp.url
+
     if actual_url != url:
         if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
+            raise ValueError(
+                f"Main page URL ({url}) redirects to out-of-scope domain "
+                f"({actual_url}), cancelling crawl"
+            )
+
         return actual_url
 
     return url
 
 
 def get_node_cmd_line(args):
     node_cmd = ["crawl"]
     for arg in [
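Assembled from the added and context lines above, check_url now reads as follows (a reconstruction for readability, not an authoritative copy of the file; requests and urlsplit are imported at module top as in the diff):

    def check_url(url):
        try:
            resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
        except requests.exceptions.RequestException as exc:
            print(f"failed to connect to {url}: {exc}")
            raise SystemExit(1)
        actual_url = resp.url

        if actual_url != url:
            if urlsplit(url).netloc != urlsplit(actual_url).netloc:
                raise ValueError(
                    f"Main page URL ({url}) redirects to out-of-scope domain "
                    f"({actual_url}), cancelling crawl"
                )

            return actual_url

        return url

The revamp swaps the GET-then-close probe for a single HEAD request (no response body to discard), lengthens the timeout from 3 to 10 seconds, and turns connection failures into a clean exit instead of an unhandled traceback, while the netloc comparison on the followed redirect still catches a main page that escapes to another domain.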