Mirror of https://github.com/openzim/zimit.git, synced 2025-09-23 03:52:16 -04:00
Code formatting

- Added requests as a dependency (although currently brought in by warc2zim)
- removed unused imports
- black code formatting and some cleanup
- revamped actual_url fetching
parent 568068ecfc
commit 0082d313ae
Dockerfile
@@ -4,7 +4,7 @@ RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN pip install 'warc2zim>=1.3.1'
+RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
 
 ADD zimit.py /app/
 
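The '>=2.24.0' pin only guarantees a minimum version at image build time. A minimal runtime sanity check, as a sketch assuming a standard Python 3 environment (not part of this commit):

    # hypothetical check that the installed requests satisfies the
    # '>=2.24.0' pin from the Dockerfile above
    import requests

    installed = tuple(int(p) for p in requests.__version__.split(".")[:3])
    assert installed >= (2, 24, 0), f"requests too old: {requests.__version__}"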
zimit.py (33 changes)
@@ -9,18 +9,16 @@ and then calls the Node based driver
 """
 
 from argparse import ArgumentParser
 import os
 import tempfile
 import subprocess
 import atexit
 import shutil
-import glob
 import signal
 import sys
 from pathlib import Path
+from urllib.parse import urlsplit
 
 from warc2zim.main import warc2zim
-from urllib.parse import urlsplit
-
 import requests
 
@@ -35,7 +33,7 @@ def zimit(args=None):
 
     parser.add_argument(
         "--newContext",
-        help="The context for each new capture, can be a new: page, session or browser.",
+        help="The context for each new capture (page, session or browser).",
         choices=["page", "session", "browser"],
         default="page",
     )
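With choices and a default, argparse now rejects any value outside the three contexts instead of passing it through to the crawler. A minimal standalone sketch of the behaviour:

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--newContext",
        help="The context for each new capture (page, session or browser).",
        choices=["page", "session", "browser"],
        default="page",
    )

    print(parser.parse_args([]).newContext)  # page (the default)
    print(parser.parse_args(["--newContext", "session"]).newContext)  # session
    # parser.parse_args(["--newContext", "tab"]) exits with
    # "invalid choice: 'tab' (choose from 'page', 'session', 'browser')"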
@@ -60,7 +58,8 @@ def zimit(args=None):
 
     parser.add_argument(
         "--scope",
-        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl "
+        "(defaults to the immediate directory of the URL)",
     )
 
     parser.add_argument(
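The reflowed help= value relies on Python's compile-time concatenation of adjacent string literals, so the message stays byte-for-byte identical to the original single line. A quick demonstration:

    # adjacent literals are joined at compile time; note the trailing
    # space kept inside the first fragment
    help_text = (
        "Regex of page URLs that should be included in the crawl "
        "(defaults to the immediate directory of the URL)"
    )
    assert "crawl (defaults" in help_text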
@@ -110,10 +109,11 @@ def zimit(args=None):
     temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
 
     if not zimit_args.keep:
+
         def cleanup():
             print("")
             print("----------")
-            print("Cleanup, removing temp dir: " + str(temp_root_dir))
+            print(f"Cleanup, removing temp dir: {temp_root_dir}")
             shutil.rmtree(temp_root_dir)
 
         atexit.register(cleanup)
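The cleanup handler is only registered when --keep is absent, so the temp dir survives for debugging on request. The pattern in isolation, as a sketch (keep is a stand-in for zimit_args.keep, and mkdtemp here uses the system temp dir rather than the output dir):

    import atexit
    import shutil
    import tempfile
    from pathlib import Path

    keep = False  # stand-in for zimit_args.keep
    temp_root_dir = Path(tempfile.mkdtemp(prefix=".tmp"))

    if not keep:

        def cleanup():
            # runs once when the interpreter exits normally
            # (including via sys.exit / SystemExit)
            shutil.rmtree(temp_root_dir)

        atexit.register(cleanup)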
@@ -130,7 +130,7 @@ def zimit(args=None):
 
     print("")
     print("----------")
-    print("running browsertrix-crawler crawl: " + cmd_line)
+    print(f"running browsertrix-crawler crawl: {cmd_line}")
     subprocess.run(cmd_args, check=True)
 
     warc_files = temp_root_dir / "collections" / "capture" / "archive"
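Because the crawl runs with check=True, a non-zero exit from browsertrix-crawler raises CalledProcessError and aborts zimit before any WARC-to-ZIM conversion starts. A tiny illustration, using the POSIX false command as a stand-in for the crawl command:

    import subprocess

    try:
        subprocess.run(["false"], check=True)  # stand-in for cmd_args
    except subprocess.CalledProcessError as exc:
        print(f"crawl failed with exit code {exc.returncode}")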
@@ -140,26 +140,31 @@ def zimit(args=None):
 
     print("")
     print("----------")
-    print("Processing {0} WARC files to ZIM".format(num_files))
+    print(f"Processing {num_files} WARC files to ZIM")
 
     return warc2zim(warc2zim_args)
 
 
 def check_url(url):
-    resp = requests.get(url, allow_redirects=True, timeout=3)
-    actual_url = resp.url
     try:
-        resp.close()
-    except Exception:
-        pass
+        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        print(f"failed to connect to {url}: {exc}")
+        raise SystemExit(1)
+    actual_url = resp.url
+
     if actual_url != url:
         if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
+            raise ValueError(
+                f"Main page URL ({url}) redirects to out-of-scope domain "
+                f"({actual_url}), cancelling crawl"
+            )
+
         return actual_url
 
     return url
 
 
 def get_node_cmd_line(args):
     node_cmd = ["crawl"]
     for arg in [
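Assembled from the added and context lines above, check_url now reads as follows (a reconstruction for readability, not an authoritative copy of the file; requests and urlsplit are imported at module top as in the diff):

    def check_url(url):
        try:
            resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
        except requests.exceptions.RequestException as exc:
            print(f"failed to connect to {url}: {exc}")
            raise SystemExit(1)
        actual_url = resp.url

        if actual_url != url:
            if urlsplit(url).netloc != urlsplit(actual_url).netloc:
                raise ValueError(
                    f"Main page URL ({url}) redirects to out-of-scope domain "
                    f"({actual_url}), cancelling crawl"
                )

            return actual_url

        return url

The revamp swaps the GET-then-close probe for a single HEAD request (no response body to discard), lengthens the timeout from 3 to 10 seconds, and turns connection failures into a clean exit instead of an unhandled traceback, while the netloc comparison on the followed redirect still catches a main page that escapes to another domain.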