From acf0aaf552b651803c359c7a053b0be16500f941 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 29 Oct 2021 21:19:11 +0000
Subject: [PATCH 1/4] update to latest browsertrix-crawler test with dev build
 of warc2zim 1.4.0 release

---
 Dockerfile | 13 ++++++++-----
 zimit.py   |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 022c3b1..b925b14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,14 @@
-FROM webrecorder/browsertrix-crawler:0.3.2
+FROM webrecorder/browsertrix-crawler:0.5.0-beta.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN pip install 'warc2zim>=1.3.6' 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
+RUN apt-get update && apt-get install -qqy libmagic1
 
-ADD zimit.py /app/
-
-RUN ln -s /app/zimit.py /usr/bin/zimit
+RUN pip3.8 install 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
+RUN pip3.8 install git+https://github.com/openzim/warc2zim@video-replay-fixes#A
 
 # download list of bad domains to filter-out. intentionnaly ran post-install
 # so it's not cached in earlier layers (url stays same but content updated)
 RUN mkdir -p /tmp/ads && cd /tmp/ads && \
@@ -22,5 +21,9 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
 RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
     chmod +x /usr/local/bin/entrypoint.sh
 
+ADD zimit.py /app/
+
+RUN ln -s /app/zimit.py /usr/bin/zimit
+
 ENTRYPOINT ["entrypoint.sh"]
 CMD ["zimit"]
diff --git a/zimit.py b/zimit.py
index 19b68f3..1fc4a24 100755
--- a/zimit.py
+++ b/zimit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env /usr/bin/python3.8
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 

From 8b5eeb31c7cbad5feebbc921cc52c44d839ecba8 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 14 Jun 2022 14:31:40 +0000
Subject: [PATCH 2/4] using crawler 0.6beta1

---
 Dockerfile | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b925b14..cd1b450 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,14 @@
-FROM webrecorder/browsertrix-crawler:0.5.0-beta.0
+FROM webrecorder/browsertrix-crawler:0.6.0-beta.1
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
+RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.0'
+
 RUN mkdir -p /output
 
 WORKDIR /app
 
-RUN apt-get update && apt-get install -qqy libmagic1
-
-RUN pip3.8 install 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13'
-RUN pip3.8 install git+https://github.com/openzim/warc2zim@video-replay-fixes#A
-
 # download list of bad domains to filter-out. intentionnaly ran post-install
 # so it's not cached in earlier layers (url stays same but content updated)
 RUN mkdir -p /tmp/ads && cd /tmp/ads && \

From 1f490ace8fc9938fe83418e30cab8a57503f69f1 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 21 Jun 2022 12:04:56 +0000
Subject: [PATCH 3/4] Updated to browsertrix-crawler 0.6 and warc2zim 1.4

---
 CHANGELOG.md |  50 ++++++++++++--
 Dockerfile   |   4 +-
 zimit.py     | 182 +++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 194 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d94d8c0..711281f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,46 @@
-# 1.1.5
+## Changelog
+
+All notable changes to this project are documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+
+
+## [Unreleased]
+
+### Added
+
+- `--urlFile` browsertrix crawler parameter
+- `--depth` browsertrix crawler parameter
+- `--extraHops`, parameter
+- `--collection` browsertrix crawler parameter
+- `--allowHashUrls` browsertrix crawler parameter
+- `--userAgentSuffix` browsertrix crawler parameter
+- `--behaviors`, parameter
+- `--behaviorTimeout` browsertrix crawler parameter
+- `--profile` browsertrix crawler parameter
+- `--sizeLimit` browsertrix crawler parameter
+- `--timeLimit` browsertrix crawler parameter
+- `--healthCheckPort`, parameter
+- `--overwrite` parameter
+
+### Changed
+
+- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
+- default WARC location after crawl changed
+from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
+
+### Removed
+
+- `--scroll` browsertrix crawler parameter (see `--behaviors`)
+- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
+
+
+## [1.1.5]
 
 - using crawler 0.3.2 and warc2zim 1.3.6
 
-# 1.1.4
+## [1.1.4]
 
 - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
 - Allows setting combinations of values for waitUntil param
@@ -11,23 +49,23 @@ - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/`
 where `capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
 
-# 1.1.3
+## [1.1.3]
 
 - allows same first-level-domain redirects
 - fixed redirects to URL in scope
 - updated crawler to 0.2.0
 - `statsFilename` now informs whether limit was hit or not
 
-# 1.1.2
+## [1.1.2]
 
 - added support for --custom-css
 - added domains block list (dfault)
 
-# 1.1.1
+## [1.1.1]
 
 - updated browsertrix-crawler to 0.1.4
 - autofetcher script to be injected by defaultDriver to capture srcsets + URLs
 in dynamically added stylesheets
 
-# 1.0
+## [1.0]
 
 - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
diff --git a/Dockerfile b/Dockerfile
index cd1b450..c9d50aa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,9 @@
-FROM webrecorder/browsertrix-crawler:0.6.0-beta.1
+FROM webrecorder/browsertrix-crawler:0.6.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
 
-RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.0'
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.3'
 
 RUN mkdir -p /output
 
diff --git a/zimit.py b/zimit.py
index 1fc4a24..103c8af 100755
--- a/zimit.py
+++ b/zimit.py
@@ -125,6 +125,11 @@ def zimit(args=None):
 
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
 
+    parser.add_argument(
+        "--urlFile",
+        help="If set, read a list of seed urls, " "one per line, from the specified",
+    )
+
     parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
 
     parser.add_argument(
@@ -142,6 +147,17 @@ def zimit(args=None):
         default="load,networkidle0",
     )
 
+    parser.add_argument(
+        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+    )
+
+    parser.add_argument(
+        "--extraHops",
+        help="Number of extra 'hops' to follow, beyond the current scope",
+        type=int,
+        default=0,
+    )
+
     parser.add_argument(
         "--limit", help="Limit crawl to this number of pages", type=int, default=0
     )
@@ -154,18 +170,107 @@ def zimit(args=None):
     )
 
     parser.add_argument(
-        "--scope",
-        help="Regex of page URLs that should be included in the crawl "
-        "(defaults to the immediate directory of the URL)",
+        "--scopeType",
+        help="A predfined scope of the crawl. For more customization, "
+        "use 'custom' and set scopeIncludeRx regexes",
+        choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
     )
 
     parser.add_argument(
-        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
+        "--include",
+        help="Regex of page URLs that should be "
+        "included in the crawl (defaults to "
+        "the immediate directory of URL)",
     )
 
     parser.add_argument(
-        "--scroll",
-        help="If set, will autoscroll to bottom of the page",
+        "--exclude",
+        help="Regex of page URLs that should be excluded from the crawl",
+    )
+
+    parser.add_argument(
+        "--collection",
+        help="Collection name to crawl to (replay will be accessible "
+        "under this name in pywb preview) instead of crawl-@ts",
+    )
+
+    parser.add_argument(
+        "--allowHashUrls",
+        help="Allow Hashtag URLs, useful for "
+        "single-page-application crawling or "
+        "when different hashtags load dynamic "
+        "content",
+    )
+
+    parser.add_argument(
+        "--mobileDevice",
+        help="Emulate mobile device by name from "
+        "https://github.com/puppeteer/puppeteer/blob"
+        "/main/src/common/DeviceDescriptors.ts",
+    )
+
+    parser.add_argument(
+        "--userAgent",
+        help="Override user-agent with specified",
+    )
+
+    parser.add_argument(
+        "--userAgentSuffix",
+        help="Append suffix to existing browser user-agent "
+        "(ex: +MyCrawler, info@example.com)",
+        default="+Zimit ",
+    )
+
+    parser.add_argument(
+        "--useSitemap",
+        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
+    )
+
+    parser.add_argument(
+        "--behaviors",
+        help="Which background behaviors to enable on each page",
+        default="autoplay,autofetch,siteSpecific",
+    )
+
+    parser.add_argument(
+        "--behaviorTimeout",
+        help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
+        "If 0, a behavior can run until finish",
+        type=int,
+        default=90,
+    )
+
+    parser.add_argument(
+        "--profile",
+        help="Path to tar.gz file which will be extracted "
+        "and used as the browser profile",
+    )
+
+    parser.add_argument(
+        "--sizeLimit",
+        help="If set, save state and exit if size limit exceeds this value",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--timeLimit",
+        help="If set, save state and exit after time limit, in seconds",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--healthCheckPort",
+        help="port to run healthcheck on",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--overwrite",
+        help="overwrite current crawl data: if set, existing collection directory "
+        "will be deleted before crawl is started",
         action="store_true",
         default=False,
     )
@@ -182,15 +287,6 @@ def zimit(args=None):
 
     parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
 
-    parser.add_argument(
-        "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
-    )
-
-    parser.add_argument(
-        "--useSitemap",
-        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
-    )
-
     parser.add_argument(
         "--custom-css",
        help="[warc2zim] Custom CSS file URL/path to inject into all articles",
@@ -211,7 +307,7 @@ def zimit(args=None):
     url = zimit_args.url
 
     if url:
-        url = check_url(url, zimit_args.scope)
+        url = check_url(url, zimit_args.scopeType)
         warc2zim_args.append("--url")
         warc2zim_args.append(url)
 
@@ -244,7 +340,7 @@ def zimit(args=None):
     cmd_args.append("--url")
     cmd_args.append(url)
 
-    user_agent_suffix = "+Zimit "
+    user_agent_suffix = zimit_args.userAgentSuffix
     if zimit_args.adminEmail:
         user_agent_suffix += zimit_args.adminEmail
 
@@ -277,9 +373,15 @@ def zimit(args=None):
         f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
     )
     print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
-    subprocess.run(cmd_args, check=True)
+    crawl = subprocess.run(cmd_args)
+    if crawl.returncode == 11:
+        print("crawl interupted by a limit")
+    elif crawl.returncode != 0:
+        raise subprocess.CalledProcessError(
+            f"returned non-zero exit status {crawl.returncode}"
+        )
 
-    warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
+    warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))
 
     num_files = sum(1 for e in warc_files.iterdir())
@@ -300,23 +402,22 @@ def check_url(url, scope=None):
     actual_url = resp.url
 
     if actual_url != url:
-        # redirect on same domain or same first-level domain
-        if get_fld(url) == get_fld(actual_url):
+        if scope in (None, "any"):
             return actual_url
 
-        # is it in scope?
-        if scope:
-            try:
-                if re.match(scope, actual_url):
-                    return actual_url
-            except Exception as exc:
-                print(f"failed to parse your scope regexp for url checking: {exc}")
-
-        raise ValueError(
-            f"Main page URL ({url}) redirects to out-of-scope domain "
-            f"({actual_url}), cancelling crawl"
+        print(
+            "[WARN] Your URL ({0}) redirects to {1} which {2} on same "
+            "first-level domain. Depending on your scopeType ({3}), "
+            "your homepage might be out-of-scope. Please check!".format(
+                url,
+                actual_url,
+                "is" if get_fld(url) == get_fld(actual_url) else "is not",
+                scope,
+            )
         )
 
+        return actual_url
+
     return url
 
 
@@ -326,13 +427,26 @@ def get_node_cmd_line(args):
         "workers",
         "newContext",
         "waitUntil",
+        "urlFile",
+        "depth",
+        "extraHops",
         "limit",
         "timeout",
-        "scope",
+        "scopeType",
+        "include",
         "exclude",
-        "scroll",
+        "collection",
+        "allowHashUrls",
         "mobileDevice",
+        "userAgent",
         "useSitemap",
+        "behaviors",
+        "behaviorTimeout",
+        "profile",
+        "sizeLimit",
+        "timeLimit",
+        "healthCheckPort",
+        "overwrite",
     ]:
         value = getattr(args, arg)
         if value:

From 932f97c9994783796490d97068002dc7ba845360 Mon Sep 17 00:00:00 2001
From: renaud gaudin
Date: Tue, 21 Jun 2022 16:43:32 +0000
Subject: [PATCH 4/4] updated tests for crawler and warc2zim

---
 .github/workflows/ci.yaml | 2 +-
 test/integration.py       | 6 +++---
 zimit.py                  | 6 ++----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index d148161..713f7ff 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,7 +13,7 @@ jobs:
         run: docker build -t zimit .
 
       - name: run crawl
-        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep
+        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep
 
       - name: run integration test suite
         run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
diff --git a/test/integration.py b/test/integration.py
index e0cf072..eefc035 100644
--- a/test/integration.py
+++ b/test/integration.py
@@ -7,8 +7,8 @@ from warcio import ArchiveIterator
 
 
 def get_zim_article(zimfile, path):
-    zim_fh = libzim.reader.File(zimfile)
-    return zim_fh.get_article(path).content.tobytes()
+    zim_fh = libzim.reader.Archive(zimfile)
+    return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
 
 
 def test_is_file():
@@ -29,7 +29,7 @@ def test_user_agent():
     """ Test that mobile user agent was used in WARC request records
     with custom Zimit and email suffix"""
     found = False
-    for warc in glob.glob("/output/.tmp*/collections/capture-*/archive/*.warc.gz"):
+    for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
         with open(warc, "rb") as fh:
             for record in ArchiveIterator(fh):
                 if record.rec_type == "request":
diff --git a/zimit.py b/zimit.py
index 103c8af..f0b360e 100755
--- a/zimit.py
+++ b/zimit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env /usr/bin/python3.8
+#!/usr/bin/env python3.8
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
@@ -377,9 +377,7 @@ def zimit(args=None):
     if crawl.returncode == 11:
         print("crawl interupted by a limit")
     elif crawl.returncode != 0:
-        raise subprocess.CalledProcessError(
-            f"returned non-zero exit status {crawl.returncode}"
-        )
+        raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
 
     warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))
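
Note: patch 4 ports test/integration.py from the removed `libzim.reader.File` API to the `libzim.reader.Archive` API, where entries are looked up by path and read through their item. A minimal sketch of that read path, assuming a ZIM at `/output/isago.zim` (the file the CI crawl produces) and an illustrative entry path that is not taken from the tests:

    #!/usr/bin/env python3.8
    import libzim.reader

    # Open the ZIM produced by the crawl (path as used by the CI job).
    zim = libzim.reader.Archive("/output/isago.zim")

    # Look up an entry by path and read its raw bytes, mirroring
    # get_zim_article() in test/integration.py.
    # "index.html" is an illustrative path, not one asserted by the tests.
    entry = zim.get_entry_by_path("index.html")
    print(len(entry.get_item().content.tobytes()), "bytes")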