mirror of
https://github.com/openzim/zimit.git
synced 2025-09-24 04:30:11 -04:00
Merge pull request #54 from openzim/mobile-useragent
Mobile Device + User Agent Support
This commit is contained in:
commit
9422defe86
6
.github/workflows/ci.yaml
vendored
6
.github/workflows/ci.yaml
vendored
@ -13,7 +13,7 @@ jobs:
|
|||||||
run: docker build -t openzim/zimit:dev .
|
run: docker build -t openzim/zimit:dev .
|
||||||
|
|
||||||
- name: run crawl
|
- name: run crawl
|
||||||
run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim
|
run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --keep
|
||||||
|
|
||||||
- name: ensure zim exists
|
- name: run integration test suite
|
||||||
run: stat ./output/isago.zim
|
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py"
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:0.1.1
|
FROM webrecorder/browsertrix-crawler:0.1.2
|
||||||
|
|
||||||
RUN mkdir -p /output
|
RUN mkdir -p /output
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
|
RUN pip install 'warc2zim>=1.3.2' 'requests>=2.24.0'
|
||||||
|
|
||||||
ADD zimit.py /app/
|
ADD zimit.py /app/
|
||||||
|
|
||||||
|
43
test/integration.py
Normal file
43
test/integration.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
import os
|
||||||
|
import glob
|
||||||
|
|
||||||
|
import libzim.reader
|
||||||
|
from warcio import ArchiveIterator
|
||||||
|
|
||||||
|
|
||||||
|
def get_zim_article(zimfile, path):
    """Return the raw bytes of one article stored in a ZIM file.

    zimfile: filesystem path of the ZIM archive to open.
    path: path of the article inside the archive (e.g. "A/index.html").
    """
    # Keep the reader bound to a name until the content has been copied out.
    archive = libzim.reader.File(zimfile)
    entry = archive.get_article(path)
    payload = entry.content
    return payload.tobytes()
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_file():
    """Check that the ZIM file is present in the output directory."""
    zim_path = "/output/isago.zim"
    assert os.path.isfile(zim_path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_zim_main_page():
    """Check that the ZIM main page points at the redirected (https) URL.

    The crawl was started from http://isago.ml/, which redirects to https,
    so the stored index page must reference the https address.
    """
    main_page = get_zim_article("/output/isago.zim", "A/index.html")
    expected_target = b'"https://isago.ml/"'
    assert expected_target in main_page
|
||||||
|
|
||||||
|
|
||||||
|
def test_user_agent():
    """Check the User-Agent recorded in the WARC request records.

    Every request record that carries a User-Agent header must mention
    "iPhone" and end with the " +Zimit test@example.com" suffix. At least
    one such record has to exist across the WARC files kept under the
    output directory.
    """
    warc_pattern = "/output/.tmp*/collections/capture/archive/*.warc.gz"
    matched = False

    for warc_path in glob.glob(warc_pattern):
        with open(warc_path, "rb") as stream:
            for rec in ArchiveIterator(stream):
                # Only request records are of interest here.
                if rec.rec_type != "request":
                    continue

                print(rec.http_headers)
                agent = rec.http_headers.get_header("User-Agent")
                # Skip request records with no (or an empty) User-Agent.
                if not agent:
                    continue

                assert "iPhone" in agent
                assert agent.endswith(" +Zimit test@example.com")
                matched = True

    # should find at least one
    assert matched
|
35
zimit.py
35
zimit.py
@ -83,6 +83,17 @@ def zimit(args=None):
|
|||||||
"--output", help="Output directory for ZIM and WARC files", default="/output"
|
"--output", help="Output directory for ZIM and WARC files", default="/output"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--useSitemap",
|
||||||
|
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
|
||||||
|
)
|
||||||
|
|
||||||
zimit_args, warc2zim_args = parser.parse_known_args(args)
|
zimit_args, warc2zim_args = parser.parse_known_args(args)
|
||||||
|
|
||||||
# pass url and output to warc2zim also
|
# pass url and output to warc2zim also
|
||||||
@ -99,7 +110,7 @@ def zimit(args=None):
|
|||||||
|
|
||||||
print("----------")
|
print("----------")
|
||||||
print("Testing warc2zim args")
|
print("Testing warc2zim args")
|
||||||
print("Running: warc2zim " + " ".join(warc2zim_args))
|
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
||||||
res = warc2zim(warc2zim_args)
|
res = warc2zim(warc2zim_args)
|
||||||
if res != 100:
|
if res != 100:
|
||||||
print("Exiting, invalid warc2zim params")
|
print("Exiting, invalid warc2zim params")
|
||||||
@ -113,7 +124,7 @@ def zimit(args=None):
|
|||||||
def cleanup():
|
def cleanup():
|
||||||
print("")
|
print("")
|
||||||
print("----------")
|
print("----------")
|
||||||
print(f"Cleanup, removing temp dir: {temp_root_dir}")
|
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
|
||||||
shutil.rmtree(temp_root_dir)
|
shutil.rmtree(temp_root_dir)
|
||||||
|
|
||||||
atexit.register(cleanup)
|
atexit.register(cleanup)
|
||||||
@ -123,6 +134,13 @@ def zimit(args=None):
|
|||||||
cmd_args.append("--url")
|
cmd_args.append("--url")
|
||||||
cmd_args.append(url)
|
cmd_args.append(url)
|
||||||
|
|
||||||
|
user_agent_suffix = "+Zimit "
|
||||||
|
if zimit_args.adminEmail:
|
||||||
|
user_agent_suffix += zimit_args.adminEmail
|
||||||
|
|
||||||
|
cmd_args.append("--userAgentSuffix")
|
||||||
|
cmd_args.append(user_agent_suffix)
|
||||||
|
|
||||||
cmd_args.append("--cwd")
|
cmd_args.append("--cwd")
|
||||||
cmd_args.append(str(temp_root_dir))
|
cmd_args.append(str(temp_root_dir))
|
||||||
|
|
||||||
@ -130,7 +148,10 @@ def zimit(args=None):
|
|||||||
|
|
||||||
print("")
|
print("")
|
||||||
print("----------")
|
print("----------")
|
||||||
print(f"running browsertrix-crawler crawl: {cmd_line}")
|
print(
|
||||||
|
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
|
||||||
|
)
|
||||||
|
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
|
||||||
subprocess.run(cmd_args, check=True)
|
subprocess.run(cmd_args, check=True)
|
||||||
|
|
||||||
warc_files = temp_root_dir / "collections" / "capture" / "archive"
|
warc_files = temp_root_dir / "collections" / "capture" / "archive"
|
||||||
@ -140,7 +161,7 @@ def zimit(args=None):
|
|||||||
|
|
||||||
print("")
|
print("")
|
||||||
print("----------")
|
print("----------")
|
||||||
print(f"Processing {num_files} WARC files to ZIM")
|
print(f"Processing {num_files} WARC files to ZIM", flush=True)
|
||||||
|
|
||||||
return warc2zim(warc2zim_args)
|
return warc2zim(warc2zim_args)
|
||||||
|
|
||||||
@ -149,7 +170,7 @@ def check_url(url):
|
|||||||
try:
|
try:
|
||||||
resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
|
resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
|
||||||
except requests.exceptions.RequestException as exc:
|
except requests.exceptions.RequestException as exc:
|
||||||
print(f"failed to connect to {url}: {exc}")
|
print(f"failed to connect to {url}: {exc}", flush=True)
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
actual_url = resp.url
|
actual_url = resp.url
|
||||||
|
|
||||||
@ -176,6 +197,8 @@ def get_node_cmd_line(args):
|
|||||||
"scope",
|
"scope",
|
||||||
"exclude",
|
"exclude",
|
||||||
"scroll",
|
"scroll",
|
||||||
|
"mobileDevice",
|
||||||
|
"useSitemap",
|
||||||
]:
|
]:
|
||||||
value = getattr(args, arg)
|
value = getattr(args, arg)
|
||||||
if value:
|
if value:
|
||||||
@ -191,7 +214,7 @@ def sigint_handler(*args):
|
|||||||
print("")
|
print("")
|
||||||
print("SIGINT/SIGTERM received, stopping zimit")
|
print("SIGINT/SIGTERM received, stopping zimit")
|
||||||
print("")
|
print("")
|
||||||
print("")
|
print("", flush=True)
|
||||||
sys.exit(3)
|
sys.exit(3)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user