Merge pull request #108 from openzim/crawler-with-video

update to latest browsertrix-crawler + warc2zim
This commit is contained in:
rgaudin 2022-06-21 16:59:15 +00:00 committed by GitHub
commit b2bb77cd65
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 204 additions and 52 deletions

@@ -13,7 +13,7 @@ jobs:
run: docker build -t zimit .
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
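Note on the `--mobileDevice "iPhone 11"` change in the crawl step above: in the updated zimit.py further down, `--mobileDevice` is a plain option and the old `nargs="?", const="iPhone X"` fallback is removed, so an explicit device name must now be supplied. A minimal argparse sketch of the difference (device name illustrative):

```python
import argparse

# New-style definition: a device name is required whenever the flag is used.
parser = argparse.ArgumentParser()
parser.add_argument("--mobileDevice", help="Emulate mobile device by name")

args = parser.parse_args(["--mobileDevice", "iPhone 11"])
assert args.mobileDevice == "iPhone 11"

# The removed definition fell back to "iPhone X" when the flag was passed bare:
# parser.add_argument("--mobileDevice", help="Crawl as Mobile Device",
#                     nargs="?", const="iPhone X")
```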

@@ -1,8 +1,46 @@
# 1.1.5
## Changelog
All notable changes to this project are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
## [Unreleased]
### Added
- `--urlFile` browsertrix crawler parameter
- `--depth` browsertrix crawler parameter
- `--extraHops` browsertrix crawler parameter
- `--collection` browsertrix crawler parameter
- `--allowHashUrls` browsertrix crawler parameter
- `--userAgentSuffix` browsertrix crawler parameter
- `--behaviors` browsertrix crawler parameter
- `--behaviorTimeout` browsertrix crawler parameter
- `--profile` browsertrix crawler parameter
- `--sizeLimit` browsertrix crawler parameter
- `--timeLimit` browsertrix crawler parameter
- `--healthCheckPort` browsertrix crawler parameter
- `--overwrite` parameter
### Changed
- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
- default WARC location after crawl changed
from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
### Removed
- `--scroll` browsertrix crawler parameter (see `--behaviors`)
- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
## [1.1.5]
- using crawler 0.3.2 and warc2zim 1.3.6
# 1.1.4
## [1.1.4]
- Defaults to `load,networkidle0` for waitUntil param (same as crawler)
- Allows setting combinations of values for waitUntil param
@@ -11,23 +49,23 @@
- Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
`capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
# 1.1.3
## [1.1.3]
- allows same first-level-domain redirects
- fixed redirects to URL in scope
- updated crawler to 0.2.0
- `statsFilename` now informs whether limit was hit or not
# 1.1.2
## [1.1.2]
- added support for --custom-css
- added domains block list (default)
# 1.1.1
## [1.1.1]
- updated browsertrix-crawler to 0.1.4
- autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
# 1.0
## [1.0]
- initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3

@@ -1,16 +1,14 @@
FROM webrecorder/browsertrix-crawler:0.3.2
FROM webrecorder/browsertrix-crawler:0.6.0
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.3'
RUN mkdir -p /output
WORKDIR /app
RUN pip install 'warc2zim>=1.3.6' 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
ADD zimit.py /app/
RUN ln -s /app/zimit.py /usr/bin/zimit
# download list of bad domains to filter out. intentionally run post-install
# so it's not cached in earlier layers (url stays same but content updated)
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
@@ -22,5 +20,9 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
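# entrypoint script: at container start, append the downloaded blocklist to /etc/hosts, then exec the requested command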
RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
chmod +x /usr/local/bin/entrypoint.sh
ADD zimit.py /app/
RUN ln -s /app/zimit.py /usr/bin/zimit
ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit"]

@@ -7,8 +7,8 @@ from warcio import ArchiveIterator
def get_zim_article(zimfile, path):
zim_fh = libzim.reader.File(zimfile)
return zim_fh.get_article(path).content.tobytes()
zim_fh = libzim.reader.Archive(zimfile)
return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
def test_is_file():
@@ -29,7 +29,7 @@ def test_user_agent():
""" Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
found = False
for warc in glob.glob("/output/.tmp*/collections/capture-*/archive/*.warc.gz"):
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
with open(warc, "rb") as fh:
for record in ArchiveIterator(fh):
if record.rec_type == "request":

zimit.py

@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3.8
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
@@ -125,6 +125,11 @@ def zimit(args=None):
parser.add_argument("-u", "--url", help="The URL to start crawling from")
parser.add_argument(
"--urlFile",
help="If set, read a list of seed urls, " "one per line, from the specified",
)
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
parser.add_argument(
@@ -142,6 +147,17 @@ def zimit(args=None):
default="load,networkidle0",
)
parser.add_argument(
"--depth", help="The depth of the crawl for all seeds", type=int, default=-1
)
parser.add_argument(
"--extraHops",
help="Number of extra 'hops' to follow, beyond the current scope",
type=int,
default=0,
)
parser.add_argument(
"--limit", help="Limit crawl to this number of pages", type=int, default=0
)
@@ -154,18 +170,107 @@ def zimit(args=None):
)
parser.add_argument(
"--scope",
help="Regex of page URLs that should be included in the crawl "
"(defaults to the immediate directory of the URL)",
"--scopeType",
help="A predfined scope of the crawl. For more customization, "
"use 'custom' and set scopeIncludeRx regexes",
choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
)
parser.add_argument(
"--exclude", help="Regex of page URLs that should be excluded from the crawl."
"--include",
help="Regex of page URLs that should be "
"included in the crawl (defaults to "
"the immediate directory of URL)",
)
parser.add_argument(
"--scroll",
help="If set, will autoscroll to bottom of the page",
"--exclude",
help="Regex of page URLs that should be excluded from the crawl",
)
parser.add_argument(
"--collection",
help="Collection name to crawl to (replay will be accessible "
"under this name in pywb preview) instead of crawl-@ts",
)
parser.add_argument(
"--allowHashUrls",
help="Allow Hashtag URLs, useful for "
"single-page-application crawling or "
"when different hashtags load dynamic "
"content",
)
parser.add_argument(
"--mobileDevice",
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob"
"/main/src/common/DeviceDescriptors.ts",
)
parser.add_argument(
"--userAgent",
help="Override user-agent with specified",
)
parser.add_argument(
"--userAgentSuffix",
help="Append suffix to existing browser user-agent "
"(ex: +MyCrawler, info@example.com)",
default="+Zimit ",
)
parser.add_argument(
"--useSitemap",
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
)
parser.add_argument(
"--behaviors",
help="Which background behaviors to enable on each page",
default="autoplay,autofetch,siteSpecific",
)
parser.add_argument(
"--behaviorTimeout",
help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
"If 0, a behavior can run until finish",
type=int,
default=90,
)
parser.add_argument(
"--profile",
help="Path to tar.gz file which will be extracted "
"and used as the browser profile",
)
parser.add_argument(
"--sizeLimit",
help="If set, save state and exit if size limit exceeds this value",
type=int,
default=0,
)
parser.add_argument(
"--timeLimit",
help="If set, save state and exit after time limit, in seconds",
type=int,
default=0,
)
parser.add_argument(
"--healthCheckPort",
help="port to run healthcheck on",
type=int,
default=0,
)
parser.add_argument(
"--overwrite",
help="overwrite current crawl data: if set, existing collection directory "
"will be deleted before crawl is started",
action="store_true",
default=False,
)
@@ -182,15 +287,6 @@ def zimit(args=None):
parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
parser.add_argument(
"--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
)
parser.add_argument(
"--useSitemap",
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
)
parser.add_argument(
"--custom-css",
help="[warc2zim] Custom CSS file URL/path to inject into all articles",
@@ -211,7 +307,7 @@ def zimit(args=None):
url = zimit_args.url
if url:
url = check_url(url, zimit_args.scope)
url = check_url(url, zimit_args.scopeType)
warc2zim_args.append("--url")
warc2zim_args.append(url)
@@ -244,7 +340,7 @@ def zimit(args=None):
cmd_args.append("--url")
cmd_args.append(url)
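# the user-agent suffix defaults to "+Zimit " (overridable via --userAgentSuffix); the admin email, when provided, is appended below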
user_agent_suffix = "+Zimit "
user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail:
user_agent_suffix += zimit_args.adminEmail
@@ -277,9 +373,13 @@ def zimit(args=None):
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
)
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
subprocess.run(cmd_args, check=True)
crawl = subprocess.run(cmd_args)
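# exit code 11 from browsertrix-crawler means the crawl stopped because a limit was reached; zimit treats this as a successful partial crawl and continues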
if crawl.returncode == 11:
print("crawl interupted by a limit")
elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
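# crawler 0.6.0 writes WARCs under collections/crawl-*/archive/ (previously collections/capture-*/archive/)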
warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
warc2zim_args.append(str(warc_files))
num_files = sum(1 for e in warc_files.iterdir())
@@ -300,22 +400,21 @@ def check_url(url, scope=None):
actual_url = resp.url
if actual_url != url:
# redirect on same domain or same first-level domain
if get_fld(url) == get_fld(actual_url):
if scope in (None, "any"):
return actual_url
# is it in scope?
if scope:
try:
if re.match(scope, actual_url):
return actual_url
except Exception as exc:
print(f"failed to parse your scope regexp for url checking: {exc}")
raise ValueError(
f"Main page URL ({url}) redirects to out-of-scope domain "
f"({actual_url}), cancelling crawl"
print(
"[WARN] Your URL ({0}) redirects to {1} which {2} on same "
"first-level domain. Depending on your scopeType ({3}), "
"your homepage might be out-of-scope. Please check!".format(
url,
actual_url,
"is" if get_fld(url) == get_fld(actual_url) else "is not",
scope,
)
)
return actual_url
return url
@@ -326,13 +425,26 @@ def get_node_cmd_line(args):
"workers",
"newContext",
"waitUntil",
"urlFile",
"depth",
"extraHops",
"limit",
"timeout",
"scope",
"scopeType",
"include",
"exclude",
"scroll",
"collection",
"allowHashUrls",
"mobileDevice",
"userAgent",
"useSitemap",
"behaviors",
"behaviorTimeout",
"profile",
"sizeLimit",
"timeLimit",
"healthCheckPort",
"overwrite",
]:
value = getattr(args, arg)
if value:
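The list above feeds the forwarding loop whose first two lines close this hunk: every zimit argument that has a value is passed through to browsertrix-crawler under the same name. A simplified, hypothetical sketch of that pattern (the shipped `get_node_cmd_line` covers a few more details than shown here):

```python
# Hypothetical, simplified forwarding pattern: each set argument becomes a
# --<name> [value] pair on the browsertrix-crawler command line; in this
# sketch, boolean flags are emitted without a value.
def forward_args(args, names):
    cmd = []
    for name in names:
        value = getattr(args, name, None)
        if value:
            cmd.append("--" + name)
            if not isinstance(value, bool):
                cmd.append(str(value))
    return cmd

# e.g. with args.depth == 2 and args.overwrite == True (values hypothetical):
# forward_args(args, ["depth", "overwrite"]) -> ["--depth", "2", "--overwrite"]
```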