Merge pull request #108 from openzim/crawler-with-video

update to latest browsertrix-crawler + warc2zim
rgaudin 2022-06-21 16:59:15 +00:00 committed by GitHub
commit b2bb77cd65
5 changed files with 204 additions and 52 deletions

CI workflow (GitHub Actions)

@@ -13,7 +13,7 @@ jobs:
         run: docker build -t zimit .
       - name: run crawl
-        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep
+        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep
       - name: run integration test suite
         run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"

CHANGELOG.md

@@ -1,8 +1,46 @@
-# 1.1.5
+## Changelog
+All notable changes to this project are documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+### Added
+- `--urlFile` browsertrix crawler parameter
+- `--depth` browsertrix crawler parameter
+- `--extraHops` browsertrix crawler parameter
+- `--collection` browsertrix crawler parameter
+- `--allowHashUrls` browsertrix crawler parameter
+- `--userAgentSuffix` browsertrix crawler parameter
+- `--behaviors` browsertrix crawler parameter
+- `--behaviorTimeout` browsertrix crawler parameter
+- `--profile` browsertrix crawler parameter
+- `--sizeLimit` browsertrix crawler parameter
+- `--timeLimit` browsertrix crawler parameter
+- `--healthCheckPort` browsertrix crawler parameter
+- `--overwrite` parameter
+### Changed
+- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
+- default WARC location after crawl changed
+  from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
+### Removed
+- `--scroll` browsertrix crawler parameter (see `--behaviors`)
+- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
+## [1.1.5]
 - using crawler 0.3.2 and warc2zim 1.3.6
-# 1.1.4
+## [1.1.4]
 - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
 - Allows setting combinations of values for waitUntil param
@@ -11,23 +49,23 @@
 - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
   `capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
-# 1.1.3
+## [1.1.3]
 - allows same first-level-domain redirects
 - fixed redirects to URL in scope
 - updated crawler to 0.2.0
 - `statsFilename` now informs whether limit was hit or not
-# 1.1.2
+## [1.1.2]
 - added support for --custom-css
 - added domains block list (dfault)
-# 1.1.1
+## [1.1.1]
 - updated browsertrix-crawler to 0.1.4
 - autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
-# 1.0
+## [1.0]
 - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
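
The "Changed" entry above means anything that locates WARCs after a crawl must switch from `collections/capture-*/archive/` to `collections/crawl-*/archive/`. A minimal sketch of discovering the new directory with pathlib, assuming the crawler's default layout (the example path is hypothetical):

```python
from pathlib import Path

# Hypothetical crawl working directory; zimit itself uses a temporary ".tmp*" dir under /output.
temp_root_dir = Path("/output/.tmp-example")

# browsertrix-crawler 0.6.0 writes WARCs under collections/crawl-*/archive/
# (0.3.x used collections/capture-*/archive/).
archive_dirs = sorted(temp_root_dir.rglob("collections/crawl-*/archive"))
if archive_dirs:
    warc_dir = archive_dirs[-1]  # latest collection; names embed the crawl timestamp
    warcs = list(warc_dir.glob("*.warc.gz"))
    print(f"found {len(warcs)} WARC file(s) in {warc_dir}")
```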

Dockerfile

@@ -1,16 +1,14 @@
-FROM webrecorder/browsertrix-crawler:0.3.2
+FROM webrecorder/browsertrix-crawler:0.6.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
+RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.3'
 RUN mkdir -p /output
 WORKDIR /app
-RUN pip install 'warc2zim>=1.3.6' 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
-ADD zimit.py /app/
-RUN ln -s /app/zimit.py /usr/bin/zimit
 # download list of bad domains to filter-out. intentionnaly ran post-install
 # so it's not cached in earlier layers (url stays same but content updated)
 RUN mkdir -p /tmp/ads && cd /tmp/ads && \
@@ -22,5 +20,9 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
 RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
     chmod +x /usr/local/bin/entrypoint.sh
+ADD zimit.py /app/
+RUN ln -s /app/zimit.py /usr/bin/zimit
 ENTRYPOINT ["entrypoint.sh"]
 CMD ["zimit"]

test/integration.py

@@ -7,8 +7,8 @@ from warcio import ArchiveIterator
 def get_zim_article(zimfile, path):
-    zim_fh = libzim.reader.File(zimfile)
-    return zim_fh.get_article(path).content.tobytes()
+    zim_fh = libzim.reader.Archive(zimfile)
+    return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
 def test_is_file():
@@ -29,7 +29,7 @@ def test_user_agent():
     """ Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
     found = False
-    for warc in glob.glob("/output/.tmp*/collections/capture-*/archive/*.warc.gz"):
+    for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
         with open(warc, "rb") as fh:
             for record in ArchiveIterator(fh):
                 if record.rec_type == "request":
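
The hunk above stops just inside the request-record loop. For context, a check like this typically goes on to read the request's `User-Agent` header through warcio; a hedged sketch, not the repository's exact assertion:

```python
from warcio import ArchiveIterator

def warc_has_user_agent(warc_path, expected_fragment):
    """Return True if any request record in the WARC carries the expected User-Agent fragment."""
    with open(warc_path, "rb") as fh:
        for record in ArchiveIterator(fh):
            if record.rec_type == "request":
                ua = record.http_headers.get_header("User-Agent") or ""
                if expected_fragment in ua:
                    return True
    return False

# e.g. warc_has_user_agent("example.warc.gz", "+Zimit test@example.com")
```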

zimit.py (182 changes)

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3.8
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
@@ -125,6 +125,11 @@ def zimit(args=None):
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
+    parser.add_argument(
+        "--urlFile",
+        help="If set, read a list of seed urls, " "one per line, from the specified",
+    )
     parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
     parser.add_argument(
@@ -142,6 +147,17 @@ def zimit(args=None):
         default="load,networkidle0",
     )
+    parser.add_argument(
+        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+    )
+    parser.add_argument(
+        "--extraHops",
+        help="Number of extra 'hops' to follow, beyond the current scope",
+        type=int,
+        default=0,
+    )
     parser.add_argument(
         "--limit", help="Limit crawl to this number of pages", type=int, default=0
     )
@@ -154,18 +170,107 @@ def zimit(args=None):
     )
     parser.add_argument(
-        "--scope",
-        help="Regex of page URLs that should be included in the crawl "
-        "(defaults to the immediate directory of the URL)",
+        "--scopeType",
+        help="A predfined scope of the crawl. For more customization, "
+        "use 'custom' and set scopeIncludeRx regexes",
+        choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
     )
     parser.add_argument(
-        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
+        "--include",
+        help="Regex of page URLs that should be "
+        "included in the crawl (defaults to "
+        "the immediate directory of URL)",
     )
     parser.add_argument(
-        "--scroll",
-        help="If set, will autoscroll to bottom of the page",
+        "--exclude",
+        help="Regex of page URLs that should be excluded from the crawl",
+    )
+    parser.add_argument(
+        "--collection",
+        help="Collection name to crawl to (replay will be accessible "
+        "under this name in pywb preview) instead of crawl-@ts",
+    )
+    parser.add_argument(
+        "--allowHashUrls",
+        help="Allow Hashtag URLs, useful for "
+        "single-page-application crawling or "
+        "when different hashtags load dynamic "
+        "content",
+    )
+    parser.add_argument(
+        "--mobileDevice",
+        help="Emulate mobile device by name from "
+        "https://github.com/puppeteer/puppeteer/blob"
+        "/main/src/common/DeviceDescriptors.ts",
+    )
+    parser.add_argument(
+        "--userAgent",
+        help="Override user-agent with specified",
+    )
+    parser.add_argument(
+        "--userAgentSuffix",
+        help="Append suffix to existing browser user-agent "
+        "(ex: +MyCrawler, info@example.com)",
+        default="+Zimit ",
+    )
+    parser.add_argument(
+        "--useSitemap",
+        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
+    )
+    parser.add_argument(
+        "--behaviors",
+        help="Which background behaviors to enable on each page",
+        default="autoplay,autofetch,siteSpecific",
+    )
+    parser.add_argument(
+        "--behaviorTimeout",
+        help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
+        "If 0, a behavior can run until finish",
+        type=int,
+        default=90,
+    )
+    parser.add_argument(
+        "--profile",
+        help="Path to tar.gz file which will be extracted "
+        "and used as the browser profile",
+    )
+    parser.add_argument(
+        "--sizeLimit",
+        help="If set, save state and exit if size limit exceeds this value",
+        type=int,
+        default=0,
+    )
+    parser.add_argument(
+        "--timeLimit",
+        help="If set, save state and exit after time limit, in seconds",
+        type=int,
+        default=0,
+    )
+    parser.add_argument(
+        "--healthCheckPort",
+        help="port to run healthcheck on",
+        type=int,
+        default=0,
+    )
+    parser.add_argument(
+        "--overwrite",
+        help="overwrite current crawl data: if set, existing collection directory "
+        "will be deleted before crawl is started",
         action="store_true",
         default=False,
     )
@@ -182,15 +287,6 @@ def zimit(args=None):
     parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
-    parser.add_argument(
-        "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
-    )
-    parser.add_argument(
-        "--useSitemap",
-        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
-    )
     parser.add_argument(
         "--custom-css",
         help="[warc2zim] Custom CSS file URL/path to inject into all articles",
@@ -211,7 +307,7 @@ def zimit(args=None):
     url = zimit_args.url
     if url:
-        url = check_url(url, zimit_args.scope)
+        url = check_url(url, zimit_args.scopeType)
         warc2zim_args.append("--url")
         warc2zim_args.append(url)
@@ -244,7 +340,7 @@ def zimit(args=None):
         cmd_args.append("--url")
         cmd_args.append(url)
-    user_agent_suffix = "+Zimit "
+    user_agent_suffix = zimit_args.userAgentSuffix
     if zimit_args.adminEmail:
         user_agent_suffix += zimit_args.adminEmail
@@ -277,9 +373,13 @@ def zimit(args=None):
         f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
     )
     print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
-    subprocess.run(cmd_args, check=True)
+    crawl = subprocess.run(cmd_args)
+    if crawl.returncode == 11:
+        print("crawl interupted by a limit")
+    elif crawl.returncode != 0:
+        raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
-    warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
+    warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))
     num_files = sum(1 for e in warc_files.iterdir())
@@ -300,22 +400,21 @@ def check_url(url, scope=None):
     actual_url = resp.url
     if actual_url != url:
-        # redirect on same domain or same first-level domain
-        if get_fld(url) == get_fld(actual_url):
+        if scope in (None, "any"):
             return actual_url
-        # is it in scope?
-        if scope:
-            try:
-                if re.match(scope, actual_url):
-                    return actual_url
-            except Exception as exc:
-                print(f"failed to parse your scope regexp for url checking: {exc}")
-        raise ValueError(
-            f"Main page URL ({url}) redirects to out-of-scope domain "
-            f"({actual_url}), cancelling crawl"
+        print(
+            "[WARN] Your URL ({0}) redirects to {1} which {2} on same "
+            "first-level domain. Depending on your scopeType ({3}), "
+            "your homepage might be out-of-scope. Please check!".format(
+                url,
+                actual_url,
+                "is" if get_fld(url) == get_fld(actual_url) else "is not",
+                scope,
+            )
         )
+        return actual_url
     return url
@@ -326,13 +425,26 @@ def get_node_cmd_line(args):
         "workers",
         "newContext",
         "waitUntil",
+        "urlFile",
+        "depth",
+        "extraHops",
         "limit",
         "timeout",
-        "scope",
+        "scopeType",
+        "include",
         "exclude",
-        "scroll",
+        "collection",
+        "allowHashUrls",
         "mobileDevice",
+        "userAgent",
         "useSitemap",
+        "behaviors",
+        "behaviorTimeout",
+        "profile",
+        "sizeLimit",
+        "timeLimit",
+        "healthCheckPort",
+        "overwrite",
     ]:
         value = getattr(args, arg)
         if value:
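
The list above enumerates the argparse values that zimit forwards to the browsertrix-crawler command line, and the loop that follows it begins that mapping. A simplified sketch of the idea (a hypothetical helper, not zimit's exact `get_node_cmd_line`; how booleans and values are appended is an assumption):

```python
def build_crawler_flags(args, forwarded_names):
    """Turn parsed argparse values into crawler CLI flags, skipping unset/falsy ones."""
    flags = []
    for name in forwarded_names:
        value = getattr(args, name)
        if not value:
            continue  # None, 0, "" or False: the flag is not forwarded at all
        flags.append(f"--{name}")
        if not isinstance(value, bool):  # store_true flags (e.g. --overwrite) carry no value
            flags.append(str(value))
    return flags

# e.g. build_crawler_flags(parsed_args, ["workers", "scopeType", "behaviors", "overwrite"])
```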