Code formatting

- Added requests as a dependency (although it is currently already brought in by warc2zim)
- Removed unused imports
- Black code formatting and some cleanup
- Revamped actual_url fetching
renaud gaudin 2020-11-10 09:12:34 +00:00
parent 568068ecfc
commit 0082d313ae
2 changed files with 20 additions and 15 deletions

Dockerfile

@@ -4,7 +4,7 @@ RUN mkdir -p /output
 WORKDIR /app
-RUN pip install 'warc2zim>=1.3.1'
+RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
 ADD zimit.py /app/

zimit.py

@@ -9,18 +9,16 @@ and then calls the Node based driver
 """
 from argparse import ArgumentParser
-import os
 import tempfile
 import subprocess
 import atexit
 import shutil
-import glob
 import signal
 import sys
 from pathlib import Path
+from urllib.parse import urlsplit

 from warc2zim.main import warc2zim
-from urllib.parse import urlsplit

 import requests
@@ -35,7 +33,7 @@ def zimit(args=None):
     parser.add_argument(
         "--newContext",
-        help="The context for each new capture, can be a new: page, session or browser.",
+        help="The context for each new capture (page, session or browser).",
         choices=["page", "session", "browser"],
         default="page",
     )
@@ -60,7 +58,8 @@ def zimit(args=None):
     parser.add_argument(
         "--scope",
-        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl "
+        "(defaults to the immediate directory of the URL)",
     )

     parser.add_argument(
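
Note: the reworded --scope help value relies on Python's implicit concatenation of adjacent string literals, so the two pieces still form a single help string. A minimal standalone check (the variable name is only for this example):

    help_text = (
        "Regex of page URLs that should be included in the crawl "
        "(defaults to the immediate directory of the URL)"
    )
    assert "crawl (defaults" in help_text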
@@ -110,10 +109,11 @@
     temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))

     if not zimit_args.keep:
+
         def cleanup():
             print("")
             print("----------")
-            print("Cleanup, removing temp dir: " + str(temp_root_dir))
+            print(f"Cleanup, removing temp dir: {temp_root_dir}")
             shutil.rmtree(temp_root_dir)

         atexit.register(cleanup)
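
For reference, a self-contained sketch of the cleanup-at-exit pattern shown in this hunk (the keep flag and temp dir are stand-ins for zimit_args.keep and the real output directory):

    import atexit
    import shutil
    import tempfile
    from pathlib import Path

    temp_root_dir = Path(tempfile.mkdtemp(prefix=".tmp"))
    keep = False  # stand-in for zimit_args.keep

    if not keep:

        def cleanup():
            # remove the temporary tree when the process exits
            print(f"Cleanup, removing temp dir: {temp_root_dir}")
            shutil.rmtree(temp_root_dir)

        atexit.register(cleanup)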
@@ -130,7 +130,7 @@
     print("")
     print("----------")
-    print("running browsertrix-crawler crawl: " + cmd_line)
+    print(f"running browsertrix-crawler crawl: {cmd_line}")
     subprocess.run(cmd_args, check=True)

     warc_files = temp_root_dir / "collections" / "capture" / "archive"
@@ -140,26 +140,31 @@
     print("")
     print("----------")
-    print("Processing {0} WARC files to ZIM".format(num_files))
+    print(f"Processing {num_files} WARC files to ZIM")

     return warc2zim(warc2zim_args)


 def check_url(url):
-    resp = requests.get(url, allow_redirects=True, timeout=3)
-    actual_url = resp.url
     try:
-        resp.close()
-    except Exception:
-        pass
+        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        print(f"failed to connect to {url}: {exc}")
+        raise SystemExit(1)
+    actual_url = resp.url

     if actual_url != url:
         if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
+            raise ValueError(
+                f"Main page URL ({url}) redirects to out-of-scope domain "
+                f"({actual_url}), cancelling crawl"
+            )
         return actual_url

     return url


 def get_node_cmd_line(args):
     node_cmd = ["crawl"]
     for arg in [
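
The out-of-scope test in the new check_url compares netloc values from urlsplit, so only redirects that land on a different host abort the crawl; same-host redirects simply replace the crawl's start URL with the resolved one. A minimal sketch with hypothetical URLs (no network needed):

    from urllib.parse import urlsplit

    requested = "https://example.com/wiki"
    same_host = "https://example.com/wiki/Home"      # followed: crawl uses the resolved URL
    other_host = "https://login.example.net/wiki"    # rejected: would raise ValueError

    assert urlsplit(requested).netloc == urlsplit(same_host).netloc
    assert urlsplit(requested).netloc != urlsplit(other_host).netloc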