Mirror of https://github.com/openzim/zimit.git
Code formatting
- Added requests as a dependency (although currently brought in by warc2zim)
- Removed unused imports
- Applied black code formatting and some cleanup
- Revamped actual_url fetching
Parent: 568068ecfc
Commit: 0082d313ae
Dockerfile
@@ -4,7 +4,7 @@ RUN mkdir -p /output

 WORKDIR /app

-RUN pip install 'warc2zim>=1.3.1'
+RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'

 ADD zimit.py /app/

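The Dockerfile now pins requests explicitly instead of relying on warc2zim to pull it in transitively. A minimal sketch (not part of the commit; assumes Python 3.8+ for importlib.metadata and plain dotted release strings) of checking that both pinned packages meet their minimum versions inside the image:

from importlib.metadata import version


def parse(ver: str) -> tuple:
    """Turn a plain dotted release like '2.24.0' into a comparable tuple."""
    return tuple(int(part) for part in ver.split(".")[:3])


# Fail loudly if either pinned dependency is missing or too old.
for pkg, minimum in (("warc2zim", "1.3.1"), ("requests", "2.24.0")):
    assert parse(version(pkg)) >= parse(minimum), f"{pkg} is older than {minimum}"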
zimit.py (33 lines changed)
@@ -9,18 +9,16 @@ and then calls the Node based driver
 """

 from argparse import ArgumentParser
-import os
 import tempfile
 import subprocess
 import atexit
 import shutil
-import glob
 import signal
 import sys
 from pathlib import Path
+from urllib.parse import urlsplit

 from warc2zim.main import warc2zim
-from urllib.parse import urlsplit
 import requests


@@ -35,7 +33,7 @@ def zimit(args=None):

     parser.add_argument(
         "--newContext",
-        help="The context for each new capture, can be a new: page, session or browser.",
+        help="The context for each new capture (page, session or browser).",
         choices=["page", "session", "browser"],
         default="page",
     )
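The reworded --newContext help keeps the same choices/default behaviour. A standalone sketch (not the full zimit parser) of how argparse enforces them:

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument(
    "--newContext",
    help="The context for each new capture (page, session or browser).",
    choices=["page", "session", "browser"],
    default="page",
)

print(parser.parse_args([]).newContext)                            # "page"
print(parser.parse_args(["--newContext", "session"]).newContext)   # "session"
# parser.parse_args(["--newContext", "tab"]) would exit with an error,
# since "tab" is not one of the allowed choices.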
@@ -60,7 +58,8 @@ def zimit(args=None):

     parser.add_argument(
         "--scope",
-        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl "
+        "(defaults to the immediate directory of the URL)",
     )

     parser.add_argument(
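The --scope help string is now split across two adjacent literals; Python joins them at compile time, so the rendered help text is unchanged. A tiny illustration:

# Adjacent string literals concatenate into one string with no newline.
help_text = (
    "Regex of page URLs that should be included in the crawl "
    "(defaults to the immediate directory of the URL)"
)
assert "crawl (defaults" in help_text
print(help_text)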
@@ -110,10 +109,11 @@ def zimit(args=None):
     temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))

     if not zimit_args.keep:
+
         def cleanup():
             print("")
             print("----------")
-            print("Cleanup, removing temp dir: " + str(temp_root_dir))
+            print(f"Cleanup, removing temp dir: {temp_root_dir}")
             shutil.rmtree(temp_root_dir)

         atexit.register(cleanup)
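Aside from the f-string, the cleanup logic is unchanged: the temporary working directory is removed when the interpreter exits. A standalone sketch of that pattern (ignore_errors is an addition in this sketch, not in zimit):

import atexit
import shutil
import tempfile
from pathlib import Path

# Create a temp working dir and register its removal for interpreter exit.
temp_root_dir = Path(tempfile.mkdtemp(prefix=".tmp"))


def cleanup():
    print(f"Cleanup, removing temp dir: {temp_root_dir}")
    shutil.rmtree(temp_root_dir, ignore_errors=True)  # ignore_errors: sketch only


atexit.register(cleanup)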
@@ -130,7 +130,7 @@ def zimit(args=None):

     print("")
     print("----------")
-    print("running browsertrix-crawler crawl: " + cmd_line)
+    print(f"running browsertrix-crawler crawl: {cmd_line}")
     subprocess.run(cmd_args, check=True)

     warc_files = temp_root_dir / "collections" / "capture" / "archive"
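The crawl invocation itself is unchanged: zimit echoes the command line, then runs browsertrix-crawler's crawl entrypoint with check=True so a failed crawl aborts the run. A sketch of that pattern (the argument list here is an illustrative assumption; zimit builds the real one in get_node_cmd_line):

import subprocess

# Illustrative arguments only.
cmd_args = ["crawl", "--url", "https://example.com/"]
cmd_line = " ".join(cmd_args)

print(f"running browsertrix-crawler crawl: {cmd_line}")
# check=True raises subprocess.CalledProcessError if crawl exits non-zero.
subprocess.run(cmd_args, check=True)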
@@ -140,26 +140,31 @@ def zimit(args=None):

     print("")
     print("----------")
-    print("Processing {0} WARC files to ZIM".format(num_files))
+    print(f"Processing {num_files} WARC files to ZIM")

     return warc2zim(warc2zim_args)

+
 def check_url(url):
-    resp = requests.get(url, allow_redirects=True, timeout=3)
-    actual_url = resp.url
     try:
-        resp.close()
-    except Exception:
-        pass
+        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
+    except requests.exceptions.RequestException as exc:
+        print(f"failed to connect to {url}: {exc}")
+        raise SystemExit(1)
+    actual_url = resp.url

     if actual_url != url:
         if urlsplit(url).netloc != urlsplit(actual_url).netloc:
-            raise ValueError("Main page URL ({0}) redirects to out-of-scope domain ({1}), cancelling crawl".format(url, actual_url))
+            raise ValueError(
+                f"Main page URL ({url}) redirects to out-of-scope domain "
+                f"({actual_url}), cancelling crawl"
+            )

         return actual_url

     return url

+
 def get_node_cmd_line(args):
     node_cmd = ["crawl"]
     for arg in [
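For reference, the revamped check_url as a standalone, runnable function (same logic as the diff above): a single HEAD request with a 10-second timeout replaces GET plus a manual close, connection failures abort the run instead of being silently ignored, and a redirect to a different host is rejected before the crawl starts.

from urllib.parse import urlsplit

import requests


def check_url(url):
    try:
        resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
    except requests.exceptions.RequestException as exc:
        print(f"failed to connect to {url}: {exc}")
        raise SystemExit(1)
    actual_url = resp.url

    if actual_url != url:
        # A redirect within the same host is fine; just crawl the final URL.
        if urlsplit(url).netloc != urlsplit(actual_url).netloc:
            raise ValueError(
                f"Main page URL ({url}) redirects to out-of-scope domain "
                f"({actual_url}), cancelling crawl"
            )
        return actual_url

    return url


# Example usage: returns the final URL after following any same-host redirect.
print(check_url("http://example.com/"))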