Code formatting

- Added requests as a dependency (although currently brought in by warc2zim)
- removed unused imports
- black code formatting and some cleanup
- revamped actual_url fetching
This commit is contained in:
renaud gaudin 2020-11-10 09:12:34 +00:00
parent 568068ecfc
commit 0082d313ae
2 changed files with 20 additions and 15 deletions

View File

@@ -4,7 +4,7 @@ RUN mkdir -p /output
WORKDIR /app
RUN pip install 'warc2zim>=1.3.1'
RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
ADD zimit.py /app/

View File

@@ -9,18 +9,16 @@ and then calls the Node based driver
"""
from argparse import ArgumentParser
import os
import tempfile
import subprocess
import atexit
import shutil
import glob
import signal
import sys
from pathlib import Path
from urllib.parse import urlsplit
from warc2zim.main import warc2zim
from urllib.parse import urlsplit
import requests
@@ -35,7 +33,7 @@ def zimit(args=None):
parser.add_argument(
"--newContext",
help="The context for each new capture, can be a new: page, session or browser.",
help="The context for each new capture (page, session or browser).",
choices=["page", "session", "browser"],
default="page",
)
@@ -60,7 +58,8 @@ def zimit(args=None):
parser.add_argument(
"--scope",
help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
help="Regex of page URLs that should be included in the crawl "
"(defaults to the immediate directory of the URL)",
)
parser.add_argument(
@@ -110,10 +109,11 @@ def zimit(args=None):
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
if not zimit_args.keep:
def cleanup():
print("")
print("----------")
print("Cleanup, removing temp dir: " + str(temp_root_dir))
print(f"Cleanup, removing temp dir: {temp_root_dir}")
shutil.rmtree(temp_root_dir)
atexit.register(cleanup)
@@ -130,7 +130,7 @@ def zimit(args=None):
print("")
print("----------")
print("running browsertrix-crawler crawl: " + cmd_line)
print(f"running browsertrix-crawler crawl: {cmd_line}")
subprocess.run(cmd_args, check=True)
warc_files = temp_root_dir / "collections" / "capture" / "archive"
@@ -140,26 +140,31 @@ def zimit(args=None):
print("")
print("----------")
print("Processing {0} WARC files to ZIM".format(num_files))
print(f"Processing {num_files} WARC files to ZIM")
return warc2zim(warc2zim_args)
def check_url(url):
resp = requests.get(url, allow_redirects=True, timeout=3)
actual_url = resp.url
try:
resp.close()
except Exception:
pass
resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
except requests.exceptions.RequestException as exc:
print(f"failed to connect to {url}: {exc}")
raise SystemExit(1)
actual_url = resp.url
if actual_url != url:
if urlsplit(url).netloc != urlsplit(actual_url).netloc:
raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
raise ValueError(
f"Main page URL ({url}) redirects to out-of-scope domain "
f"({actual_url}), cancelling crawl"
)
return actual_url
return url
def get_node_cmd_line(args):
node_cmd = ["crawl"]
for arg in [