From acf0aaf552b651803c359c7a053b0be16500f941 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 29 Oct 2021 21:19:11 +0000
Subject: [PATCH 1/4] update to latest browsertrix-crawler test with dev build
 of warc2zim 1.4.0 release

---
 Dockerfile | 13 ++++++++-----
 zimit.py   |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 022c3b1..b925b14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,14 @@
-FROM webrecorder/browsertrix-crawler:0.3.2
+FROM webrecorder/browsertrix-crawler:0.5.0-beta.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN pip install 'warc2zim>=1.3.6' 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
+RUN apt-get update && apt-get install -qqy libmagic1
 
-ADD zimit.py /app/
-
-RUN ln -s /app/zimit.py /usr/bin/zimit
+RUN pip3.8 install 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
+RUN pip3.8 install git+https://github.com/openzim/warc2zim@video-replay-fixes#A
 
 # download list of bad domains to filter-out. intentionnaly ran post-install
 # so it's not cached in earlier layers (url stays same but content updated)
 RUN mkdir -p /tmp/ads && cd /tmp/ads && \
@@ -22,5 +21,9 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
 RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
     chmod +x /usr/local/bin/entrypoint.sh
 
+ADD zimit.py /app/
+
+RUN ln -s /app/zimit.py /usr/bin/zimit
+
 ENTRYPOINT ["entrypoint.sh"]
 CMD ["zimit"]
diff --git a/zimit.py b/zimit.py
index 19b68f3..1fc4a24 100755
--- a/zimit.py
+++ b/zimit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env /usr/bin/python3.8
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 

From 8b5eeb31c7cbad5feebbc921cc52c44d839ecba8 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 14 Jun 2022 14:31:40 +0000
Subject: [PATCH 2/4] using crawler 0.6beta1

---
 Dockerfile | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b925b14..cd1b450 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,14 @@
-FROM webrecorder/browsertrix-crawler:0.5.0-beta.0
+FROM webrecorder/browsertrix-crawler:0.6.0-beta.1
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
+RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.0'
+
 RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN apt-get update && apt-get install -qqy libmagic1
-
-RUN pip3.8 install 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
-RUN pip3.8 install git+https://github.com/openzim/warc2zim@video-replay-fixes#A
-
 # download list of bad domains to filter-out. intentionnaly ran post-install
 # so it's not cached in earlier layers (url stays same but content updated)
 RUN mkdir -p /tmp/ads && cd /tmp/ads && \

From 1f490ace8fc9938fe83418e30cab8a57503f69f1 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 21 Jun 2022 12:04:56 +0000
Subject: [PATCH 3/4] Updated to browsertrix-crawler 0.6 and warc2zim 1.4

---
 CHANGELOG.md |  50 ++++++++++++--
 Dockerfile   |   4 +-
 zimit.py     | 182 +++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 194 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d94d8c0..711281f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,46 @@
-# 1.1.5
+## Changelog
+
+All notable changes to this project are documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+
+
+## [Unreleased]
+
+### Added
+
+- `--urlFile` browsertrix crawler parameter
+- `--depth` browsertrix crawler parameter
+- `--extraHops`, parameter
+- `--collection` browsertrix crawler parameter
+- `--allowHashUrls` browsertrix crawler parameter
+- `--userAgentSuffix` browsertrix crawler parameter
+- `--behaviors`, parameter
+- `--behaviorTimeout` browsertrix crawler parameter
+- `--profile` browsertrix crawler parameter
+- `--sizeLimit` browsertrix crawler parameter
+- `--timeLimit` browsertrix crawler parameter
+- `--healthCheckPort`, parameter
+- `--overwrite` parameter
+
+### Changed
+
+- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
+- default WARC location after crawl changed
+from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
+
+### Removed
+
+- `--scroll` browsertrix crawler parameter (see `--behaviors`)
+- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
+
+
+## [1.1.5]
 
 - using crawler 0.3.2 and warc2zim 1.3.6
 
-# 1.1.4
+## [1.1.4]
 
 - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
 - Allows setting combinations of values for waitUntil param
@@ -11,23 +49,23 @@ - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/`
 where `capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
 
-# 1.1.3
+## [1.1.3]
 
 - allows same first-level-domain redirects
 - fixed redirects to URL in scope
 - updated crawler to 0.2.0
 - `statsFilename` now informs whether limit was hit or not
 
-# 1.1.2
+## [1.1.2]
 
 - added support for --custom-css
 - added domains block list (dfault)
 
-# 1.1.1
+## [1.1.1]
 
 - updated browsertrix-crawler to 0.1.4
 - autofetcher script to be injected by defaultDriver to capture srcsets + URLs
 in dynamically added stylesheets
 
-# 1.0
+## [1.0]
 
 - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
diff --git a/Dockerfile b/Dockerfile
index cd1b450..c9d50aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,9 @@
-FROM webrecorder/browsertrix-crawler:0.6.0-beta.1
+FROM webrecorder/browsertrix-crawler:0.6.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
 
-RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.0'
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.3'
 
 RUN mkdir -p /output
 
diff --git a/zimit.py b/zimit.py
index 1fc4a24..103c8af 100755
--- a/zimit.py
+++ b/zimit.py
@@ -125,6 +125,11 @@ def zimit(args=None):
 
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
 
+    parser.add_argument(
+        "--urlFile",
+        help="If set, read a list of seed urls, " "one per line, from the specified",
+    )
+
     parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
 
     parser.add_argument(
@@ -142,6 +147,17 @@ def zimit(args=None):
         default="load,networkidle0",
     )
 
+    parser.add_argument(
+        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+    )
+
+    parser.add_argument(
+        "--extraHops",
+        help="Number of extra 'hops' to follow, beyond the current scope",
+        type=int,
+        default=0,
+    )
+
     parser.add_argument(
         "--limit", help="Limit crawl to this number of pages", type=int, default=0
     )
@@ -154,18 +170,107 @@ def zimit(args=None):
     )
 
     parser.add_argument(
-        "--scope",
-        help="Regex of page URLs that should be included in the crawl "
-        "(defaults to the immediate directory of the URL)",
+        "--scopeType",
+        help="A predfined scope of the crawl. For more customization, "
+        "use 'custom' and set scopeIncludeRx regexes",
+        choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
     )
 
     parser.add_argument(
-        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
+        "--include",
+        help="Regex of page URLs that should be "
+        "included in the crawl (defaults to "
+        "the immediate directory of URL)",
     )
 
     parser.add_argument(
-        "--scroll",
-        help="If set, will autoscroll to bottom of the page",
+        "--exclude",
+        help="Regex of page URLs that should be excluded from the crawl",
+    )
+
+    parser.add_argument(
+        "--collection",
+        help="Collection name to crawl to (replay will be accessible "
+        "under this name in pywb preview) instead of crawl-@ts",
+    )
+
+    parser.add_argument(
+        "--allowHashUrls",
+        help="Allow Hashtag URLs, useful for "
+        "single-page-application crawling or "
+        "when different hashtags load dynamic "
+        "content",
+    )
+
+    parser.add_argument(
+        "--mobileDevice",
+        help="Emulate mobile device by name from "
+        "https://github.com/puppeteer/puppeteer/blob"
+        "/main/src/common/DeviceDescriptors.ts",
+    )
+
+    parser.add_argument(
+        "--userAgent",
+        help="Override user-agent with specified",
+    )
+
+    parser.add_argument(
+        "--userAgentSuffix",
+        help="Append suffix to existing browser user-agent "
+        "(ex: +MyCrawler, info@example.com)",
+        default="+Zimit ",
+    )
+
+    parser.add_argument(
+        "--useSitemap",
+        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
+    )
+
+    parser.add_argument(
+        "--behaviors",
+        help="Which background behaviors to enable on each page",
+        default="autoplay,autofetch,siteSpecific",
+    )
+
+    parser.add_argument(
+        "--behaviorTimeout",
+        help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
+        "If 0, a behavior can run until finish",
+        type=int,
+        default=90,
+    )
+
+    parser.add_argument(
+        "--profile",
+        help="Path to tar.gz file which will be extracted "
+        "and used as the browser profile",
+    )
+
+    parser.add_argument(
+        "--sizeLimit",
+        help="If set, save state and exit if size limit exceeds this value",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--timeLimit",
+        help="If set, save state and exit after time limit, in seconds",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--healthCheckPort",
+        help="port to run healthcheck on",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--overwrite",
+        help="overwrite current crawl data: if set, existing collection directory "
+        "will be deleted before crawl is started",
         action="store_true",
         default=False,
     )
@@ -182,15 +287,6 @@ def zimit(args=None):
 
     parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
 
-    parser.add_argument(
-        "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
-    )
-
-    parser.add_argument(
-        "--useSitemap",
-        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
-    )
-
     parser.add_argument(
         "--custom-css",
        help="[warc2zim] Custom CSS file URL/path to inject into all articles",
@@ -211,7 +307,7 @@ def zimit(args=None):
     url = zimit_args.url
 
     if url:
-        url = check_url(url, zimit_args.scope)
+        url = check_url(url, zimit_args.scopeType)
         warc2zim_args.append("--url")
         warc2zim_args.append(url)
 
@@ -244,7 +340,7 @@ def zimit(args=None):
     cmd_args.append("--url")
     cmd_args.append(url)
 
-    user_agent_suffix = "+Zimit "
+    user_agent_suffix = zimit_args.userAgentSuffix
     if zimit_args.adminEmail:
         user_agent_suffix += zimit_args.adminEmail
 
@@ -277,9 +373,15 @@ def zimit(args=None):
         f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
     )
     print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
-    subprocess.run(cmd_args, check=True)
+    crawl = subprocess.run(cmd_args)
+    if crawl.returncode == 11:
+        print("crawl interupted by a limit")
+    elif crawl.returncode != 0:
+        raise subprocess.CalledProcessError(
+            f"returned non-zero exit status {crawl.returncode}"
+        )
 
-    warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
+    warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))
 
     num_files = sum(1 for e in warc_files.iterdir())
@@ -300,23 +402,22 @@ def check_url(url, scope=None):
     actual_url = resp.url
 
     if actual_url != url:
-        # redirect on same domain or same first-level domain
-        if get_fld(url) == get_fld(actual_url):
+        if scope in (None, "any"):
             return actual_url
 
-        # is it in scope?
-        if scope:
-            try:
-                if re.match(scope, actual_url):
-                    return actual_url
-            except Exception as exc:
-                print(f"failed to parse your scope regexp for url checking: {exc}")
-
-        raise ValueError(
-            f"Main page URL ({url}) redirects to out-of-scope domain "
-            f"({actual_url}), cancelling crawl"
+        print(
+            "[WARN] Your URL ({0}) redirects to {1} which {2} on same "
+            "first-level domain. Depending on your scopeType ({3}), "
+            "your homepage might be out-of-scope. Please check!".format(
+                url,
+                actual_url,
+                "is" if get_fld(url) == get_fld(actual_url) else "is not",
+                scope,
+            )
         )
 
+        return actual_url
+
     return url
 
 
@@ -326,13 +427,26 @@ def get_node_cmd_line(args):
         "workers",
         "newContext",
         "waitUntil",
+        "urlFile",
+        "depth",
+        "extraHops",
         "limit",
         "timeout",
-        "scope",
+        "scopeType",
+        "include",
         "exclude",
-        "scroll",
+        "collection",
+        "allowHashUrls",
         "mobileDevice",
+        "userAgent",
         "useSitemap",
+        "behaviors",
+        "behaviorTimeout",
+        "profile",
+        "sizeLimit",
+        "timeLimit",
+        "healthCheckPort",
+        "overwrite",
     ]:
         value = getattr(args, arg)
         if value:

From 932f97c9994783796490d97068002dc7ba845360 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 21 Jun 2022 16:43:32 +0000
Subject: [PATCH 4/4] updated tests for crawler and warc2zim

---
 .github/workflows/ci.yaml | 2 +-
 test/integration.py       | 6 +++---
 zimit.py                  | 6 ++----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index d148161..713f7ff 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,7 +13,7 @@ jobs:
         run: docker build -t zimit .
 
       - name: run crawl
-        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep
+        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep
 
       - name: run integration test suite
         run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
diff --git a/test/integration.py b/test/integration.py
index e0cf072..eefc035 100644
--- a/test/integration.py
+++ b/test/integration.py
@@ -7,8 +7,8 @@ from warcio import ArchiveIterator
 
 
 def get_zim_article(zimfile, path):
-    zim_fh = libzim.reader.File(zimfile)
-    return zim_fh.get_article(path).content.tobytes()
+    zim_fh = libzim.reader.Archive(zimfile)
+    return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
 
 
 def test_is_file():
@@ -29,7 +29,7 @@ def test_user_agent():
     """ Test that mobile user agent was used in WARC request records
     with custom Zimit and email suffix"""
     found = False
-    for warc in glob.glob("/output/.tmp*/collections/capture-*/archive/*.warc.gz"):
+    for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
         with open(warc, "rb") as fh:
             for record in ArchiveIterator(fh):
                 if record.rec_type == "request":
diff --git a/zimit.py b/zimit.py
index 103c8af..f0b360e 100755
--- a/zimit.py
+++ b/zimit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env /usr/bin/python3.8
+#!/usr/bin/env python3.8
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
@@ -377,9 +377,7 @@ def zimit(args=None):
     if crawl.returncode == 11:
         print("crawl interupted by a limit")
     elif crawl.returncode != 0:
-        raise subprocess.CalledProcessError(
-            f"returned non-zero exit status {crawl.returncode}"
-        )
+        raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
 
     warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))
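
Note: patch 4 ports test/integration.py from the removed `libzim.reader.File` API to the `libzim.reader.Archive` API, where entries are looked up by path and read through their item. A minimal sketch of that read path, assuming a ZIM at `/output/isago.zim` (the file the CI crawl produces) and an illustrative entry path that is not taken from the tests:

    #!/usr/bin/env python3.8
    import libzim.reader

    # Open the ZIM produced by the crawl (path as used by the CI job).
    zim = libzim.reader.Archive("/output/isago.zim")

    # Look up an entry by path and read its raw bytes, mirroring
    # get_zim_article() in test/integration.py.
    # "index.html" is an illustrative path, not one asserted by the tests.
    entry = zim.get_entry_by_path("index.html")
    print(len(entry.get_item().content.tobytes()), "bytes")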