From 1f490ace8fc9938fe83418e30cab8a57503f69f1 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 21 Jun 2022 12:04:56 +0000
Subject: [PATCH] Updated to browsertrix-crawler 0.6 and warc2zim 1.4

---
 CHANGELOG.md |  50 ++++++++++++--
 Dockerfile   |   4 +-
 zimit.py     | 182 +++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 194 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d94d8c0..711281f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,46 @@
-# 1.1.5
+## Changelog
+
+All notable changes to this project are documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+
+
+## [Unreleased]
+
+### Added
+
+- `--urlFile` browsertrix crawler parameter
+- `--depth` browsertrix crawler parameter
+- `--extraHops` browsertrix crawler parameter
+- `--collection` browsertrix crawler parameter
+- `--allowHashUrls` browsertrix crawler parameter
+- `--userAgentSuffix` browsertrix crawler parameter
+- `--behaviors` browsertrix crawler parameter
+- `--behaviorTimeout` browsertrix crawler parameter
+- `--profile` browsertrix crawler parameter
+- `--sizeLimit` browsertrix crawler parameter
+- `--timeLimit` browsertrix crawler parameter
+- `--healthCheckPort` browsertrix crawler parameter
+- `--overwrite` parameter
+
+### Changed
+
+- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
+- default WARC location after crawl changed
+from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
+
+### Removed
+
+- `--scroll` browsertrix crawler parameter (see `--behaviors`)
+- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
+
+
+## [1.1.5]
 
 - using crawler 0.3.2 and warc2zim 1.3.6
 
-# 1.1.4
+## [1.1.4]
 
 - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
 - Allows setting combinations of values for waitUntil param
@@ -11,23 +49,23 @@
 - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where `capture-*` is dynamic and includes the datetime.
 (from browsertrix-crawler)
 
-# 1.1.3
+## [1.1.3]
 
 - allows same first-level-domain redirects
 - fixed redirects to URL in scope
 - updated crawler to 0.2.0
 - `statsFilename` now informs whether limit was hit or not
 
-# 1.1.2
+## [1.1.2]
 
 - added support for --custom-css
 - added domains block list (dfault)
 
-# 1.1.1
+## [1.1.1]
 
 - updated browsertrix-crawler to 0.1.4
 - autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
 
-# 1.0
+## [1.0]
 
 - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
diff --git a/Dockerfile b/Dockerfile
index cd1b450..c9d50aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,9 @@
-FROM webrecorder/browsertrix-crawler:0.6.0-beta.1
+FROM webrecorder/browsertrix-crawler:0.6.0
 
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
 
-RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.0'
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.3'
 
 RUN mkdir -p /output
diff --git a/zimit.py b/zimit.py
index 1fc4a24..103c8af 100755
--- a/zimit.py
+++ b/zimit.py
@@ -125,6 +125,11 @@ def zimit(args=None):
 
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
 
+    parser.add_argument(
+        "--urlFile",
+        help="If set, read a list of seed urls, " "one per line, from the specified file",
+    )
+
     parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
 
     parser.add_argument(
@@ -142,6 +147,17 @@ def zimit(args=None):
         default="load,networkidle0",
     )
 
+    parser.add_argument(
+        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+    )
+
+    parser.add_argument(
+        "--extraHops",
+        help="Number of extra 'hops' to follow, beyond the current scope",
+        type=int,
+        default=0,
+    )
+
     parser.add_argument(
         "--limit", help="Limit crawl to this number of pages", type=int, default=0
     )
@@ -154,18 +170,107 @@ def zimit(args=None):
     )
 
     parser.add_argument(
-        "--scope",
-        help="Regex of page URLs that should be included in the crawl "
-        "(defaults to the immediate directory of the URL)",
+        "--scopeType",
+        help="A predefined scope of the crawl. For more customization, "
+        "use 'custom' and set scopeIncludeRx regexes",
+        choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
     )
 
     parser.add_argument(
-        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
+ "--include", + help="Regex of page URLs that should be " + "included in the crawl (defaults to " + "the immediate directory of URL)", ) parser.add_argument( - "--scroll", - help="If set, will autoscroll to bottom of the page", + "--exclude", + help="Regex of page URLs that should be excluded from the crawl", + ) + + parser.add_argument( + "--collection", + help="Collection name to crawl to (replay will be accessible " + "under this name in pywb preview) instead of crawl-@ts", + ) + + parser.add_argument( + "--allowHashUrls", + help="Allow Hashtag URLs, useful for " + "single-page-application crawling or " + "when different hashtags load dynamic " + "content", + ) + + parser.add_argument( + "--mobileDevice", + help="Emulate mobile device by name from " + "https://github.com/puppeteer/puppeteer/blob" + "/main/src/common/DeviceDescriptors.ts", + ) + + parser.add_argument( + "--userAgent", + help="Override user-agent with specified", + ) + + parser.add_argument( + "--userAgentSuffix", + help="Append suffix to existing browser user-agent " + "(ex: +MyCrawler, info@example.com)", + default="+Zimit ", + ) + + parser.add_argument( + "--useSitemap", + help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)", + ) + + parser.add_argument( + "--behaviors", + help="Which background behaviors to enable on each page", + default="autoplay,autofetch,siteSpecific", + ) + + parser.add_argument( + "--behaviorTimeout", + help="If >0, timeout (in seconds) for in-page behavior will run on each page. " + "If 0, a behavior can run until finish", + type=int, + default=90, + ) + + parser.add_argument( + "--profile", + help="Path to tar.gz file which will be extracted " + "and used as the browser profile", + ) + + parser.add_argument( + "--sizeLimit", + help="If set, save state and exit if size limit exceeds this value", + type=int, + default=0, + ) + + parser.add_argument( + "--timeLimit", + help="If set, save state and exit after time limit, in seconds", + type=int, + default=0, + ) + + parser.add_argument( + "--healthCheckPort", + help="port to run healthcheck on", + type=int, + default=0, + ) + + parser.add_argument( + "--overwrite", + help="overwrite current crawl data: if set, existing collection directory " + "will be deleted before crawl is started", action="store_true", default=False, ) @@ -182,15 +287,6 @@ def zimit(args=None): parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler") - parser.add_argument( - "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X" - ) - - parser.add_argument( - "--useSitemap", - help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)", - ) - parser.add_argument( "--custom-css", help="[warc2zim] Custom CSS file URL/path to inject into all articles", @@ -211,7 +307,7 @@ def zimit(args=None): url = zimit_args.url if url: - url = check_url(url, zimit_args.scope) + url = check_url(url, zimit_args.scopeType) warc2zim_args.append("--url") warc2zim_args.append(url) @@ -244,7 +340,7 @@ def zimit(args=None): cmd_args.append("--url") cmd_args.append(url) - user_agent_suffix = "+Zimit " + user_agent_suffix = zimit_args.userAgentSuffix if zimit_args.adminEmail: user_agent_suffix += zimit_args.adminEmail @@ -277,9 +373,15 @@ def zimit(args=None): f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}" ) print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) - subprocess.run(cmd_args, check=True) + crawl = 
+    if crawl.returncode == 11:
+        print("crawl interrupted by a limit")
+    elif crawl.returncode != 0:
+        raise subprocess.CalledProcessError(
+            crawl.returncode, cmd_args
+        )
 
-    warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
+    warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))
 
     num_files = sum(1 for e in warc_files.iterdir())
@@ -300,23 +402,22 @@ def check_url(url, scope=None):
     actual_url = resp.url
 
     if actual_url != url:
-        # redirect on same domain or same first-level domain
-        if get_fld(url) == get_fld(actual_url):
+        if scope in (None, "any"):
             return actual_url
 
-        # is it in scope?
-        if scope:
-            try:
-                if re.match(scope, actual_url):
-                    return actual_url
-            except Exception as exc:
-                print(f"failed to parse your scope regexp for url checking: {exc}")
-
-        raise ValueError(
-            f"Main page URL ({url}) redirects to out-of-scope domain "
-            f"({actual_url}), cancelling crawl"
+        print(
+            "[WARN] Your URL ({0}) redirects to {1} which {2} on same "
+            "first-level domain. Depending on your scopeType ({3}), "
+            "your homepage might be out-of-scope. Please check!".format(
+                url,
+                actual_url,
+                "is" if get_fld(url) == get_fld(actual_url) else "is not",
+                scope,
+            )
         )
 
+        return actual_url
+
     return url
 
 
@@ -326,13 +427,26 @@ def get_node_cmd_line(args):
         "workers",
         "newContext",
         "waitUntil",
+        "urlFile",
+        "depth",
+        "extraHops",
         "limit",
         "timeout",
-        "scope",
+        "scopeType",
+        "include",
         "exclude",
-        "scroll",
+        "collection",
+        "allowHashUrls",
         "mobileDevice",
+        "userAgent",
         "useSitemap",
+        "behaviors",
+        "behaviorTimeout",
+        "profile",
+        "sizeLimit",
+        "timeLimit",
+        "healthCheckPort",
+        "overwrite",
     ]:
         value = getattr(args, arg)
         if value: