mobile + user agent support:

- add support for custom user agent suffix +Zimit with email address specifiable via --adminEmail cmd arg #38
- add ability to crawl as mobile device with --mobileDevice flag (default to iPhone X)
add integration tests runnable in docker via github actions
logging: print temp dir, flush print statements for immediate logging
This commit is contained in:
Ilya Kreymer 2020-11-14 19:36:47 +00:00
parent 0e3af5124b
commit a930542af8
4 changed files with 65 additions and 12 deletions

View File

@ -13,7 +13,7 @@ jobs:
run: docker build -t openzim/zimit:dev .
- name: run crawl
run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim
run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice
- name: ensure zim exists
run: stat ./output/isago.zim
- name: run integration test suite
run: docker run -it -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./test/integration.py"

View File

@ -1,10 +1,11 @@
FROM webrecorder/browsertrix-crawler:0.1.1
FROM webrecorder/browsertrix-crawler:0.1.2
RUN mkdir -p /output
WORKDIR /app
RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
#RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
RUN pip install git+https://github.com/openzim/warc2zim.git@replay-update
ADD zimit.py /app/

38
test/integration.py Normal file
View File

@ -0,0 +1,38 @@
import os
import glob
import libzim.reader
from warcio import ArchiveIterator
def get_zim_article(zimfile, path):
    """Return the content of the article at ``path`` in ``zimfile``.

    The ZIM file is opened with ``libzim.reader.File`` and the article's
    content buffer is converted with ``tobytes()`` before being returned.
    """
    reader = libzim.reader.File(zimfile)
    article = reader.get_article(path)
    return article.content.tobytes()
def test_is_file():
    """Ensure the ZIM file exists at /output/isago.zim."""
    zim_path = "/output/isago.zim"
    assert os.path.isfile(zim_path)
def test_zim_main_page():
    """Ensure the ZIM main page is the redirected (https) page.

    The main page specified for the crawl, http://isago.ml/, was a redirect
    to https, so the stored ``A/index.html`` entry must reference the
    https URL.
    """
    main_page = get_zim_article("/output/isago.zim", "A/index.html")
    assert b'"https://isago.ml/"' in main_page
def test_user_agent():
    """Test that the mobile user agent with the Zimit suffix was used.

    Scans the WARC files matching the capture-archive glob under /output and
    checks the first request record that carries a ``User-Agent`` header:
    the value must contain "iPhone" and end with the custom
    " +Zimit test@example.com" suffix.

    Raises:
        AssertionError: if the user agent does not match, or if no request
            record with a ``User-Agent`` header is found at all.
    """
    warc_pattern = "/output/.tmp*/collections/capture/archive/*.warc.gz"

    for warc in glob.glob(warc_pattern):
        with open(warc, "rb") as fh:
            for record in ArchiveIterator(fh):
                if record.rec_type != "request":
                    continue

                print(record.http_headers)
                ua = record.http_headers.get_header("User-Agent")
                if not ua:
                    continue

                assert "iPhone" in ua, f"not a mobile user agent: {ua!r}"
                assert ua.endswith(
                    " +Zimit test@example.com"
                ), f"missing Zimit/email suffix in user agent: {ua!r}"
                # One conforming request record is enough.
                return

    # Reaching this point means no WARC matched the pattern, or none of the
    # request records carried a User-Agent header.
    raise AssertionError(
        f"no request record with a User-Agent header found in {warc_pattern}"
    )

View File

@ -83,6 +83,12 @@ def zimit(args=None):
"--output", help="Output directory for ZIM and WARC files", default="/output"
)
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
parser.add_argument(
"--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
)
zimit_args, warc2zim_args = parser.parse_known_args(args)
# pass url and output to warc2zim also
@ -99,7 +105,7 @@ def zimit(args=None):
print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args))
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
res = warc2zim(warc2zim_args)
if res != 100:
print("Exiting, invalid warc2zim params")
@ -109,11 +115,10 @@ def zimit(args=None):
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
if not zimit_args.keep:
def cleanup():
print("")
print("----------")
print(f"Cleanup, removing temp dir: {temp_root_dir}")
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
shutil.rmtree(temp_root_dir)
atexit.register(cleanup)
@ -123,6 +128,13 @@ def zimit(args=None):
cmd_args.append("--url")
cmd_args.append(url)
user_agent_suffix = "+Zimit "
if zimit_args.adminEmail:
user_agent_suffix += zimit_args.adminEmail
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
@ -130,7 +142,8 @@ def zimit(args=None):
print("")
print("----------")
print(f"running browsertrix-crawler crawl: {cmd_line}")
print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}")
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
subprocess.run(cmd_args, check=True)
warc_files = temp_root_dir / "collections" / "capture" / "archive"
@ -140,7 +153,7 @@ def zimit(args=None):
print("")
print("----------")
print(f"Processing {num_files} WARC files to ZIM")
print(f"Processing {num_files} WARC files to ZIM", flush=True)
return warc2zim(warc2zim_args)
@ -149,7 +162,7 @@ def check_url(url):
try:
resp = requests.head(url, stream=True, allow_redirects=True, timeout=10)
except requests.exceptions.RequestException as exc:
print(f"failed to connect to {url}: {exc}")
print(f"failed to connect to {url}: {exc}", flush=True)
raise SystemExit(1)
actual_url = resp.url
@ -176,6 +189,7 @@ def get_node_cmd_line(args):
"scope",
"exclude",
"scroll",
"mobileDevice",
]:
value = getattr(args, arg)
if value:
@ -191,7 +205,7 @@ def sigint_handler(*args):
print("")
print("SIGINT/SIGTERM received, stopping zimit")
print("")
print("")
print("", flush=True)
sys.exit(3)