Merge branch 'zimit2'

2025-09-24 04:30:11 -04:00 · 2024-05-24 14:07:05 +00:00 · 2024-05-24 14:07:05 +00:00 · ce49a5d4e9
commit ce49a5d4e9
parent f46f2568ff 1d54b20873
18 changed files with 753 additions and 188 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,2 +0,0 @@
 output/
 node_modules/
--- a/.github/workflows/Publish.yml
+++ b/.github/workflows/Publish.yml
@ -1,26 +1,20 @@
-name: Docker
+name: Publish released version
 on:
-  push:
+  release:
-    branches:
+    types: [published]
      - main
    tags:
      - v*
 jobs:
-  build-and-push:
+   publish:
    name: Deploy Docker Image
    runs-on: ubuntu-22.04
    steps:
-      - name: Retrieve source code
+      - uses: actions/checkout@v3
        uses: actions/checkout@v3
-      - name: Build and push
+      - name: Build and push Docker image
        uses: openzim/docker-publish-action@v10
        with:
          image-name: openzim/zimit
          on-master: dev
          tag-pattern: /^v([0-9.]+)$/
          latest-on-tag: true
          restrict-to: openzim/zimit
--- a/.github/workflows/PublishDockerDevImage.yaml
+++ b/.github/workflows/PublishDockerDevImage.yaml
@ -0,0 +1,30 @@
 # Builds the Docker image and pushes it to ghcr.io on every push to `main`.
 name: Publish Docker dev image
 on:
  push:
    branches:
      - main
 jobs:
  publish:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Build and push Docker image
        uses: openzim/docker-publish-action@v10
        with:
          image-name: openzim/zimit
          # presumably the tag given to the pushed image; confirm against the
          # docker-publish-action documentation
          manual-tag: dev
          latest-on-tag: false
          restrict-to: openzim/zimit
          registries: ghcr.io
          # registry login is taken from repository secrets
          credentials:
            GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
            GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
          repo_description: auto
          repo_overview: auto
          # literal block: one target platform per line
          platforms: |
            linux/amd64
            linux/arm64
--- a/.github/workflows/PublishDockerZimit2Image.yaml
+++ b/.github/workflows/PublishDockerZimit2Image.yaml
@ -0,0 +1,30 @@
 # Builds the Docker image and pushes it to ghcr.io on every push to the
 # `zimit2` branch.
 name: Publish Docker zimit2 image
 on:
  push:
    branches:
      - zimit2
 jobs:
  publish:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Build and push Docker image
        uses: openzim/docker-publish-action@v10
        with:
          image-name: openzim/zimit
          # presumably the tag given to the pushed image; confirm against the
          # docker-publish-action documentation
          manual-tag: zimit2
          latest-on-tag: false
          restrict-to: openzim/zimit
          registries: ghcr.io
          # registry login is taken from repository secrets
          credentials:
            GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
            GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
          repo_description: auto
          repo_overview: auto
          # literal block: one target platform per line
          platforms: |
            linux/amd64
            linux/arm64
--- a/.github/workflows/QA.yaml
+++ b/.github/workflows/QA.yaml
@ -0,0 +1,34 @@
 # Formatting, lint and type checks, run on pull requests and on pushes to `main`.
 name: QA
 on:
  pull_request:
  push:
    branches:
      - main
 jobs:
  check-qa:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # the Python version is read from pyproject.toml instead of being
          # pinned in this workflow
          python-version-file: pyproject.toml
          architecture: x64
      # editable install of the project with the lint, scripts, test and
      # check extras
      - name: Install dependencies (and project)
        run: |
          pip install -U pip
          pip install -e .[lint,scripts,test,check]
      # every check below is run through an `inv` task
      - name: Check black formatting
        run: inv lint-black
      - name: Check ruff
        run: inv lint-ruff
      - name: Check pyright
        run: inv check-pyright
--- a/.github/workflows/Tests.yaml
+++ b/.github/workflows/Tests.yaml
@ -0,0 +1,66 @@
 # Unit tests with coverage, a Python packaging check and a Docker-based
 # integration test, run on pull requests and on pushes to `main`.
 name: Tests
 on:
  pull_request:
  push:
    branches:
      - main
 jobs:
  # runs the test suite under coverage and uploads the report to codecov
  run-tests:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # the Python version is read from pyproject.toml
          python-version-file: pyproject.toml
          architecture: x64
      - name: Install dependencies (and project)
        run: |
          pip install -U pip
          pip install -e .[test,scripts]
      - name: Run the tests
        run: inv coverage --args "-vvv"
      - name: Upload coverage report to codecov
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
  # checks that both an sdist and a wheel can be built
  build_python:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version-file: pyproject.toml
          architecture: x64
      - name: Ensure we can build Python targets
        run: |
          pip install -U pip build
          python3 -m build --sdist --wheel
  # this job replaces the standard "build_docker" job since it builds the docker image
  run-integration-tests:
    runs-on: ubuntu-22.04
    steps:
      - name: checkout
        uses: actions/checkout@v3
      - name: build image
        run: docker build -t zimit .
      # crawl output is written to ./output on the runner (mounted as /output);
      # --keep is passed so the temporary crawl directory is not deleted
      - name: run crawl
        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
      # the integration test file is mounted into the image and run with a
      # pytest installed on the fly into the image's virtualenv
      - name: run integration test suite
        run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@ -1,20 +0,0 @@
 name: CI
 on: push
 jobs:
  integration-tests:
    runs-on: ubuntu-22.04
    steps:
      - name: checkout
        uses: actions/checkout@v3
      - name: build image
        run: docker build -t zimit .
      - name: run crawl
        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
      - name: run integration test suite
        run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,27 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v4.4.0
  hooks:
  -   id: trailing-whitespace
  -   id: end-of-file-fixer
 - repo: https://github.com/psf/black
  rev: "24.2.0"
  hooks:
  -   id: black
 - repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.3.0
  hooks:
  - id: ruff
 - repo: https://github.com/RobertCraigie/pyright-python
  rev: v1.1.352
  hooks:
  - id: pyright
    name: pyright (system)
    description: 'pyright static type checker'
    entry: pyright
    language: system
    'types_or': [python, pyi]
    require_serial: true
    minimum_pre_commit_version: '2.9.2'
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
 - New `--version` flag to display Zimit version
 - New `--logging` flag to adjust Browsertrix Crawler logging (#273)
 - Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
 - New `--noMobileDevice` CLI argument
 ### Changed
 - Use `warc2zim` version 2, which no longer requires a Service Worker
 - Using the `warc2zim2` branch of warc2zim ⚠️ change before releasing!
 - Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
 - Adopt Python bootstrap conventions
 - Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
 - Upgrade to Python 3.12 + upgrade dependencies
 - `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values
 - `--userAgent` CLI argument is not mandatory anymore
 - Upgraded Browsertrix Crawler to 1.0.3
 ### Fixed
 - Fix support for Youtube videos (#291)
 ## [1.6.3] - 2024-01-18
--- a/Dockerfile
+++ b/Dockerfile
@ -1,36 +1,48 @@
-FROM webrecorder/browsertrix-crawler:0.12.4
+FROM webrecorder/browsertrix-crawler:1.1.1
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
 RUN add-apt-repository ppa:deadsnakes/ppa -y
 RUN apt-get update \
 && apt-get install -qqy --no-install-recommends \
      libmagic1 \
-        python3.10-venv \
+      python3.12-venv \
 && rm -rf /var/lib/apt/lists/* \
 # python setup (in venv not to conflict with browsertrix)
-    && python3 -m venv /app/zimit \
+ && python3.12 -m venv /app/zimit \
    && /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \
    'warc2zim==1.5.5' \
 # placeholder (default output location)
 && mkdir -p /output \
 # disable chrome upgrade
 && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
 # download list of bad domains to filter-out. intentionally run post-install \
 # so it's not cached in earlier layers (url stays same but content updated) \
-    mkdir -p /tmp/ads && cd /tmp/ads && \
+ && mkdir -p /tmp/ads \
-    curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
+ && cd /tmp/ads \
-    curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
+ && curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
-    curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
+ && curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
-    cat ./*.txt > /etc/blocklist.txt \
+ && curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
 && cat ./*.txt > /etc/blocklist.txt \
 && rm ./*.txt \
-    && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
+ && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
-    chmod +x /usr/local/bin/entrypoint.sh
+ && chmod +x /usr/local/bin/entrypoint.sh
-WORKDIR /app
+# Copy pyproject.toml and its dependencies
-ADD zimit.py /app/
+COPY pyproject.toml README.md /src/
-# fix shebang on zimit to use in-venv python
+COPY src/zimit/__about__.py /src/src/zimit/__about__.py
-RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
+
-    && ln -s /app/zimit.py /usr/bin/zimit \
+# Install Python dependencies
-    && chmod +x /usr/bin/zimit
+RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
 # Copy code + associated artifacts
 COPY src /src/src
 COPY *.md /src/
 # Install + cleanup
 RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
 && ln -s /app/zimit/bin/zimit /usr/bin/zimit \
 && chmod +x /usr/bin/zimit \
 && rm -rf /src
 ENTRYPOINT ["entrypoint.sh"]
-CMD ["zimit"]
+CMD ["zimit", "--help"]
--- a/README.md
+++ b/README.md
@ -3,12 +3,9 @@ Zimit
 Zimit is a scraper that creates a ZIM file from any website.
 [![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
 [![Build](https://github.com/openzim/zimit/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/zimit/actions?query=branch%3Amain)
 [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
-
+[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
 ⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS.
 Technical background
 --------------------
@ -68,7 +65,10 @@ default and prints the crawl status to the Docker log.
 Nota bene
 ---------
-A first version of a generic HTTP scraper was created in 2016 during
+While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
 since Zimit 2.x, which does not have any special requirements anymore.
 It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
 the [Wikimania Esino Lario
 Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,224 @@
 [build-system]
 requires = ["hatchling", "hatch-openzim==0.2.0"]
 build-backend = "hatchling.build"
 [project]
 name = "zimit"
 requires-python = ">=3.12,<3.13"
 description = "Make ZIM file from any website through crawling"
 readme = "README.md"
 dependencies = [
  "requests==2.31.0",
  "inotify==0.2.10",
  "tld==0.13",
  "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 [tool.hatch.metadata.hooks.openzim-metadata]
 kind = "scraper"
 [tool.hatch.metadata]
 allow-direct-references = true  # to be removed once we use a released warc2zim version
 [project.optional-dependencies]
 scripts = [
  "invoke==2.2.0",
 ]
 lint = [
  "black==24.2.0",
  "ruff==0.3.0",
 ]
 check = [
  "pyright==1.1.352",
 ]
 test = [
  "pytest==8.0.2",
  "coverage==7.4.3",
 ]
 dev = [
  "pre-commit==3.6.2",
  "debugpy==1.8.1",
  "zimit[scripts]",
  "zimit[lint]",
  "zimit[test]",
  "zimit[check]",
 ]
 [project.scripts]
 zimit = "zimit:zimit.zimit"
 [tool.hatch.version]
 path = "src/zimit/__about__.py"
 [tool.hatch.build]
 exclude = [
  "/.github",
 ]
 [tool.hatch.build.targets.wheel]
 packages = ["src/zimit"]
 [tool.hatch.envs.default]
 features = ["dev"]
 [tool.hatch.envs.test]
 features = ["scripts", "test"]
 [tool.hatch.envs.test.scripts]
 run = "inv test --args '{args}'"
 run-cov = "inv test-cov --args '{args}'"
 report-cov = "inv report-cov"
 coverage = "inv coverage --args '{args}'"
 html = "inv coverage --html --args '{args}'"
 [tool.hatch.envs.lint]
 template = "lint"
 skip-install = false
 features = ["scripts", "lint"]
 [tool.hatch.envs.lint.scripts]
 black = "inv lint-black --args '{args}'"
 ruff = "inv lint-ruff --args '{args}'"
 all = "inv lintall --args '{args}'"
 fix-black = "inv fix-black --args '{args}'"
 fix-ruff = "inv fix-ruff --args '{args}'"
 fixall = "inv fixall --args '{args}'"
 [tool.hatch.envs.check]
 features = ["scripts", "check"]
 [tool.hatch.envs.check.scripts]
 pyright = "inv check-pyright --args '{args}'"
 all = "inv checkall --args '{args}'"
 [tool.black]
 line-length = 88
 target-version = ['py312']
 [tool.ruff]
 target-version = "py312"
 line-length = 88
 src = ["src"]
 [tool.ruff.lint]
 select = [
  "A",  # flake8-builtins
  # "ANN",  # flake8-annotations
  "ARG",  # flake8-unused-arguments
  # "ASYNC",  # flake8-async
  "B",  # flake8-bugbear
  # "BLE",  # flake8-blind-except
  "C4",  # flake8-comprehensions
  "C90",  # mccabe
  # "COM",  # flake8-commas
  # "D",  # pydocstyle
  # "DJ",  # flake8-django
  "DTZ",  # flake8-datetimez
  "E",  # pycodestyle (default)
  "EM",  # flake8-errmsg
  # "ERA",  # eradicate
  # "EXE",  # flake8-executable
  "F",  # Pyflakes (default)
  # "FA",  # flake8-future-annotations
  "FBT",  # flake8-boolean-trap
  # "FLY",  # flynt
  # "G",  # flake8-logging-format
  "I",  # isort
  "ICN",  # flake8-import-conventions
  # "INP",  # flake8-no-pep420
  # "INT",  # flake8-gettext
  "ISC",  # flake8-implicit-str-concat
  "N",  # pep8-naming
  # "NPY",  # NumPy-specific rules
  # "PD",  # pandas-vet
  # "PGH",  # pygrep-hooks
  # "PIE",  # flake8-pie
  # "PL",  # Pylint
  "PLC",  # Pylint: Convention
  "PLE",  # Pylint: Error
  "PLR",  # Pylint: Refactor
  "PLW",  # Pylint: Warning
  # "PT",  # flake8-pytest-style
  # "PTH",  # flake8-use-pathlib
  # "PYI",  # flake8-pyi
  "Q",  # flake8-quotes
  # "RET",  # flake8-return
  # "RSE",  # flake8-raise
  "RUF",  # Ruff-specific rules
  "S",  # flake8-bandit
  # "SIM",  # flake8-simplify
  # "SLF",  # flake8-self
  "T10",  # flake8-debugger
  "T20",  # flake8-print
  # "TCH",  # flake8-type-checking
  # "TD",  # flake8-todos
  "TID",  # flake8-tidy-imports
  # "TRY",  # tryceratops
  "UP",  # pyupgrade
  "W",  # pycodestyle
  "YTT",  # flake8-2020
 ]
 ignore = [
  # Allow non-abstract empty methods in abstract base classes
  "B027",
  # Remove flake8-errmsg since we consider they bloat the code and provide limited value
  "EM",
  # Allow boolean positional values in function calls, like `dict.get(... True)`
  "FBT003",
  # Ignore checks for possible passwords
  "S105", "S106", "S107",
  # Ignore warnings on subprocess.run / popen
  "S603",
  # Ignore complexity
  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
 ]
 unfixable = [
  # Don't touch unused imports
  "F401",
 ]
 [tool.ruff.lint.isort]
 known-first-party = ["zimit"]
 [tool.ruff.lint.flake8-bugbear]
 # add exceptions to B008 for fastapi.
 extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
 [tool.ruff.lint.flake8-tidy-imports]
 ban-relative-imports = "all"
 [tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "tests**/**/*" = ["PLR2004", "S101", "TID252"]
 [tool.pytest.ini_options]
 minversion = "7.3"
 testpaths = ["tests"]
 pythonpath = [".", "src"]
 [tool.coverage.paths]
 zimit = ["src/zimit"]
 tests = ["tests"]
 [tool.coverage.run]
 source_pkgs = ["zimit"]
 branch = true
 parallel = true
 omit = [
  "src/zimit/__about__.py",
 ]
 [tool.coverage.report]
 exclude_lines = [
  "no cov",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
 [tool.pyright]
 include = ["src", "tests", "tasks.py"]
 exclude = [".env/**", ".venv/**"]
 extraPaths = ["src"]
 pythonVersion = "3.12"
 typeCheckingMode="basic"
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@ -0,0 +1 @@
 __version__ = "2.0.0-dev4"
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@ -1,7 +1,3 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 """
 Main zimit run script
 This script validates arguments with warc2zim, checks permissions
@ -11,6 +7,7 @@ and then calls the Node based driver
 import atexit
 import itertools
 import json
 import logging
 import shutil
 import signal
 import subprocess
@ -23,19 +20,24 @@ from pathlib import Path
 import inotify
 import inotify.adapters
 import requests
 from tld import get_fld
 from warc2zim.main import main as warc2zim
 from zimscraperlib.logging import getLogger
 from zimscraperlib.uri import rebuild_uri
-DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
+from zimit.__about__ import __version__
 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
 EXIT_CODE_CRAWLER_LIMIT_HIT = 11
 NORMAL_WARC2ZIM_EXIT_CODE = 100
 logger = getLogger(name="zimit", level=logging.INFO)
 class ProgressFileWatcher:
-    def __init__(self, output_dir, stats_path):
+    def __init__(self, output_dir: Path, stats_path: Path):
        self.crawl_path = output_dir / "crawl.json"
        self.warc2zim_path = output_dir / "warc2zim.json"
-        self.stats_path = Path(stats_path)
+        self.stats_path = stats_path
        if not self.stats_path.is_absolute():
            self.stats_path = output_dir / self.stats_path
@ -46,6 +48,8 @@ class ProgressFileWatcher:
        self.process = None
    def stop(self):
        if not self.process:
            return
        self.process.join(0.1)
        self.process.terminate()
@ -58,10 +62,10 @@ class ProgressFileWatcher:
        self.process.start()
    @staticmethod
-    def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath):
+    def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
        ino = inotify.adapters.Inotify()
-        ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
+        ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
-        ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
+        ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
        class Limit:
            def __init__(self):
@ -97,15 +101,15 @@ class ProgressFileWatcher:
                "limit": limit.as_dict,
            }
-        for _, _, fpath, _ in ino.event_gen(yield_nones=False):
+        for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
            func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
            if not func:
                continue
            # open input and output separately so as not to clear output on error
-            with open(fpath, "r") as ifh:
+            with open(fpath) as ifh:
                try:
                    out = func(json.load(ifh), limit)
-                except Exception:  # nosec
+                except Exception:  # nosec # noqa: S112
                    # simply ignore progress update should an error arise
                    # might be malformed input for instance
                    continue
@ -115,7 +119,7 @@ class ProgressFileWatcher:
                    json.dump(out, ofh)
-def zimit(args=None):
+def run(raw_args):
    wait_until_options = ["load", "domcontentloaded", "networkidle"]
    wait_until_all = wait_until_options + [
        f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
@ -131,7 +135,7 @@ def zimit(args=None):
    parser.add_argument(
        "--urlFile",
-        help="If set, read a list of seed urls, " "one per line, from the specified",
+        help="If set, read a list of seed urls, one per line, from the specified",
    )
    parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
@ -205,7 +209,8 @@ def zimit(args=None):
    parser.add_argument(
        "--lang",
-        help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
+        help="if set, sets the language used by the browser, should be ISO 639 "
        "language[-country] code",
    )
    parser.add_argument(
@ -220,12 +225,21 @@ def zimit(args=None):
        help="Emulate mobile device by name from "
        "https://github.com/puppeteer/puppeteer/blob/"
        "main/packages/puppeteer-core/src/common/Device.ts",
        default="Pixel 2",
    )
    # opt-out switch: when set, `--mobileDevice` is not forwarded to the crawler
    parser.add_argument(
        "--noMobileDevice",
        # adjacent string literals are concatenated verbatim, so the first one
        # must end with a space to keep "is" and "uncertain" apart
        help="Do not emulate a mobile device (use at your own risk, behavior is "
        "uncertain)",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--userAgent",
-        help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
+        help="Override default user-agent with specified value ; --userAgentSuffix and "
-        default=DEFAULT_USER_AGENT,
+        "--adminEmail have no effect when this is set",
    )
    parser.add_argument(
@ -333,7 +347,38 @@ def zimit(args=None):
        "to configure the crawling behaviour if not set via argument.",
    )
-    zimit_args, warc2zim_args = parser.parse_known_args(args)
+    parser.add_argument(
        "--version",
        help="Display scraper version and exit",
        action="version",
        version=f"Zimit {__version__}",
    )
    parser.add_argument(
        "--logging",
        help="Crawler logging configuration",
    )
    zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
    logger.info("Checking browsertrix-crawler version")
    crawl_version_cmd = ["crawl", "--version"]
    try:
        crawl = subprocess.run(
            crawl_version_cmd, check=True, capture_output=True, text=True
        )
    except Exception:
        logger.error("Failed to get Browsertrix crawler version")
        raise
    crawler_version = crawl.stdout.strip()
    logger.info(f"Browsertrix crawler: version {crawler_version}")
    # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
    # versions are associated with the ZIM
    warc2zim_args.append("--scraper-suffix")
    warc2zim_args.append(
        f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
    )
    # pass url and output to warc2zim also
    if zimit_args.output:
@ -342,14 +387,12 @@ def zimit(args=None):
    url = zimit_args.url
-    user_agent = zimit_args.userAgent
+    user_agent_suffix = zimit_args.userAgentSuffix
    if zimit_args.userAgentSuffix:
        user_agent += f" {zimit_args.userAgentSuffix}"
    if zimit_args.adminEmail:
-        user_agent += f" {zimit_args.adminEmail}"
+        user_agent_suffix += f" {zimit_args.adminEmail}"
    if url:
-        url = check_url(url, user_agent, zimit_args.scopeType)
+        url = get_cleaned_url(url)
        warc2zim_args.append("--url")
        warc2zim_args.append(url)
@ -372,13 +415,13 @@ def zimit(args=None):
        warc2zim_args.append("--lang")
        warc2zim_args.append(zimit_args.zim_lang)
-    print("----------")
+    logger.info("----------")
-    print("Testing warc2zim args")
+    logger.info("Testing warc2zim args")
-    print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
+    logger.info("Running: warc2zim " + " ".join(warc2zim_args))
    res = warc2zim(warc2zim_args)
-    if res != 100:
+    if res != NORMAL_WARC2ZIM_EXIT_CODE:
-        print("Exiting, invalid warc2zim params")
+        logger.info("Exiting, invalid warc2zim params")
-        return 2
+        return EXIT_CODE_WARC2ZIM_CHECK_FAILED
    # make temp dir for this crawl
    if zimit_args.build:
@ -389,9 +432,9 @@ def zimit(args=None):
    if not zimit_args.keep:
        def cleanup():
-            print("")
+            logger.info("")
-            print("----------")
+            logger.info("----------")
-            print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
+            logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
            shutil.rmtree(temp_root_dir)
        atexit.register(cleanup)
@ -401,8 +444,12 @@ def zimit(args=None):
        cmd_args.append("--url")
        cmd_args.append(url)
-    cmd_args.append("--userAgent")
+    cmd_args.append("--userAgentSuffix")
-    cmd_args.append(user_agent)
+    cmd_args.append(user_agent_suffix)
    if not zimit_args.noMobileDevice:
        cmd_args.append("--mobileDevice")
        cmd_args.append(zimit_args.mobileDevice)
    cmd_args.append("--cwd")
    cmd_args.append(str(temp_root_dir))
@ -412,7 +459,7 @@ def zimit(args=None):
        watcher = ProgressFileWatcher(
            Path(zimit_args.output), Path(zimit_args.statsFilename)
        )
-        print(f"Writing progress to {watcher.stats_path}")
+        logger.info(f"Writing progress to {watcher.stats_path}")
        # update crawler command
        cmd_args.append("--statsFilename")
        cmd_args.append(str(watcher.crawl_path))
@ -424,15 +471,16 @@ def zimit(args=None):
    cmd_line = " ".join(cmd_args)
-    print("")
+    logger.info("")
-    print("----------")
+    logger.info("----------")
-    print(
+    logger.info(
-        f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
+        f"Output to tempdir: {temp_root_dir} - "
        f"{'will keep' if zimit_args.keep else 'will delete'}"
    )
-    print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
+    logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
-    crawl = subprocess.run(cmd_args)
+    crawl = subprocess.run(cmd_args, check=False)
-    if crawl.returncode == 11:
+    if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
-        print("crawl interupted by a limit")
+        logger.info("crawl interupted by a limit")
    elif crawl.returncode != 0:
        raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
@ -447,65 +495,33 @@ def zimit(args=None):
                "Failed to find directory where WARC files have been created"
            )
        elif len(warc_dirs) > 1:
-            print("Found many WARC files directories, only last one will be used")
+            logger.info("Found many WARC files directories, only last one will be used")
            for directory in warc_dirs:
-                print(f"- {directory}")
+                logger.info(f"- {directory}")
        warc_directory = warc_dirs[-1]
-    print("")
+    logger.info("")
-    print("----------")
+    logger.info("----------")
-    print(f"Processing WARC files in {warc_directory}")
+    logger.info(f"Processing WARC files in {warc_directory}")
    warc2zim_args.append(str(warc_directory))
    num_files = sum(1 for _ in warc_directory.iterdir())
-    print(f"{num_files} WARC files found", flush=True)
+    logger.info(f"{num_files} WARC files found")
-    print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)
+    logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
    return warc2zim(warc2zim_args)
-def check_url(url, user_agent, scope=None):
+def get_cleaned_url(url: str):
-    url = urllib.parse.urlparse(url)
+    parsed_url = urllib.parse.urlparse(url)
    try:
        with requests.get(
            url.geturl(),
            stream=True,
            allow_redirects=True,
            timeout=(12.2, 27),
            headers={"User-Agent": user_agent},
        ) as resp:
            resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
        raise SystemExit(1)
    actual_url = urllib.parse.urlparse(resp.url)
    # remove explicit port in URI for default-for-scheme as browsers does it
-    if actual_url.scheme == "https" and actual_url.port == 443:
+    if parsed_url.scheme == "https" and parsed_url.port == 443:  # noqa: PLR2004
-        actual_url = rebuild_uri(actual_url, port="")
+        parsed_url = rebuild_uri(parsed_url, port="")
-    if actual_url.scheme == "http" and actual_url.port == 80:
+    if parsed_url.scheme == "http" and parsed_url.port == 80:  # noqa: PLR2004
-        actual_url = rebuild_uri(actual_url, port="")
+        parsed_url = rebuild_uri(parsed_url, port="")
-    if actual_url.geturl() != url.geturl():
+    return parsed_url.geturl()
        if scope in (None, "any"):
            return actual_url.geturl()
        print(
            "[WARN] Your URL ({0}) redirects to {1} which {2} on same "
            "first-level domain. Depending on your scopeType ({3}), "
            "your homepage might be out-of-scope. Please check!".format(
                url.geturl(),
                actual_url.geturl(),
                "is"
                if get_fld(url.geturl()) == get_fld(actual_url.geturl())
                else "is not",
                scope,
            )
        )
        return actual_url.geturl()
    return url.geturl()
 def get_node_cmd_line(args):
@ -527,7 +543,7 @@ def get_node_cmd_line(args):
        "collection",
        "allowHashUrls",
        "lang",
-        "mobileDevice",
+        "userAgent",
        "useSitemap",
        "behaviors",
        "behaviorTimeout",
@ -539,9 +555,10 @@ def get_node_cmd_line(args):
        "healthCheckPort",
        "overwrite",
        "config",
        "logging",
    ]:
        value = getattr(args, arg)
-        if value == None or (isinstance(value, bool) and value == False):
+        if value is None or (isinstance(value, bool) and value is False):
            continue
        node_cmd.append("--" + arg)
        if not isinstance(value, bool):
@ -550,17 +567,22 @@ def get_node_cmd_line(args):
    return node_cmd
-def sigint_handler(*args):
+def sigint_handler(*args):  # noqa: ARG001
-    print("")
+    logger.info("")
-    print("")
+    logger.info("")
-    print("SIGINT/SIGTERM received, stopping zimit")
+    logger.info("SIGINT/SIGTERM received, stopping zimit")
-    print("")
+    logger.info("")
-    print("", flush=True)
+    logger.info("")
    sys.exit(3)
 def zimit():
    """Entry point: run zimit with the process command-line arguments.

    The value returned by `run` is handed back to the caller instead of being
    dropped. A generated console script passes its entry point's return value
    to `sys.exit`, so a failed run (for instance
    EXIT_CODE_WARC2ZIM_CHECK_FAILED) now yields a non-zero exit status
    rather than always reporting success.
    """
    return run(sys.argv[1:])
 signal.signal(signal.SIGINT, sigint_handler)
 signal.signal(signal.SIGTERM, sigint_handler)
 if __name__ == "__main__":
    zimit()
--- a/tasks.py
+++ b/tasks.py
@ -0,0 +1,109 @@
 # pyright: strict, reportUntypedFunctionDecorator=false
 import os
 from invoke.context import Context
 from invoke.tasks import task  # pyright: ignore [reportUnknownVariableType]
 use_pty = not os.getenv("CI", "")
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """run the test suite with pytest (no coverage collected)

    `args` is appended verbatim to the pytest command line.
    """
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run tests with coverage

    `args` is appended verbatim to the pytest command line, which is itself
    launched through `coverage run -m`.
    """
    command = f"coverage run -m pytest {args}"
    ctx.run(command, pty=use_pty)
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """combine coverage data and print the report

    When `html` is true, an HTML report is generated as well.
    """
    # warn=True is only set on the combine step, so per invoke's documented
    # semantics a failing combine warns instead of aborting the task
    ctx.run("coverage combine", warn=True, pty=use_pty)
    ctx.run("coverage report --show-missing", pty=use_pty)
    if not html:
        return
    ctx.run("coverage html", pty=use_pty)
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """run tests under coverage, then report the results

    Chains the test_cov and report_cov tasks, forwarding `args` to the former
    and `html` to the latter.
    """
    test_cov(ctx, args)
    report_cov(ctx, html=html)
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """check black formatting (print the version, then a check-only diff)"""
    target = args or "."  # needed for hatch script
    for command in ("black --version", f"black --check --diff {target}"):
        ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """check ruff rules (print the version, then run `ruff check`)"""
    target = args or "."  # needed for hatch script
    for command in ("ruff --version", f"ruff check {target}"):
        ctx.run(command, pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Check linting with black, then ruff"""
    target = args or "."  # needed for hatch script
    for linter in (lint_black, lint_ruff):
        linter(ctx, target)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright

    Prints the pyright version first, then runs pyright with `args` appended
    verbatim to its command line.
    """
    # pty=use_pty on the version call too, as the black/ruff version calls do
    ctx.run("pyright --version", pty=use_pty)
    ctx.run(f"pyright {args}", pty=use_pty)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """check static types (currently pyright only)"""
    check_pyright(ctx, args=args)
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """reformat code with black"""
    target = args or "."  # needed for hatch script
    command = f"black {target}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """apply ruff automatic fixes"""
    target = args or "."  # needed for hatch script
    command = f"ruff check --fix {target}"
    ctx.run(command, pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Fix everything automatically, then re-check linting"""
    target = args or "."  # needed for hatch script
    # fixers first, then lintall as a final verification pass
    for step in (fix_black, fix_ruff, lintall):
        step(ctx, target)
--- a/tests-integration/README.md
+++ b/tests-integration/README.md
@ -0,0 +1 @@
 These are integration tests, meant to be run inside the CI (because we first need to perform a zimit run on a given website and then check its output)
--- a/tests-integration/integration.py
+++ b/tests-integration/integration.py
@ -1,14 +1,9 @@
 import os
 import glob
 import json
 import os
 import libzim.reader
 from warcio import ArchiveIterator
-
+from zimscraperlib.zim import Archive
 def get_zim_article(zimfile, path):
    """Return the content, as bytes, of the item at `path` in the ZIM `zimfile`."""
    archive = libzim.reader.Archive(zimfile)
    entry = archive.get_entry_by_path(path)
    item = entry.get_item()
    return item.content.tobytes()
 def test_is_file():
@ -20,20 +15,34 @@ def test_zim_main_page():
    """Main page specified, http://isago.rskg.org/, was a redirect to https
    Ensure main page is the redirected page"""
-    assert b'"https://isago.rskg.org/"' in get_zim_article(
+    main_entry = Archive("/output/isago.zim").main_entry
-        "/output/isago.zim", "A/index.html"
+    assert main_entry.is_redirect
-    )
+    assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
 def test_zim_scraper():
    """Ensure the ZIM "Scraper" metadata names every tool involved

    The value must mention zimit, warc2zim and Browsertrix crawler.
    """
    scraper = Archive("/output/isago.zim").get_text_metadata("Scraper")
    for tool in ("zimit ", "warc2zim ", "Browsertrix crawler "):
        assert tool in scraper
 def test_user_agent():
-    """Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
+    """Test that mobile user agent was used
    Check is done in WARC request records with custom Zimit and email suffix
    """
    found = False
    for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
        with open(warc, "rb") as fh:
            for record in ArchiveIterator(fh):
                if record.rec_type == "request":
-                    print(record.http_headers)
+                    print(record.http_headers)  # noqa: T201
                    ua = record.http_headers.get_header("User-Agent")
                    if ua:
                        assert "Mozilla" in ua
@ -56,12 +65,12 @@ def test_stats_output():
        }
    with open("/output/warc2zim.json") as fh:
        assert json.loads(fh.read()) == {
-            "written": 8,
+            "written": 7,
-            "total": 8,
+            "total": 7,
        }
    with open("/output/stats.json") as fh:
        assert json.loads(fh.read()) == {
-            "done": 8,
+            "done": 7,
-            "total": 8,
+            "total": 7,
            "limit": {"max": 0, "hit": False},
        }
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@ -0,0 +1,6 @@
 from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
 # dummy test, just to have coverage report done
 def test_something_exists():
    """Smoke test: the constant imports from zimit.zimit and is truthy."""
    # NOTE(review): a truthiness check would fail for a value of 0; presumably
    # the constant is non-zero, confirm against zimit.zimit
    assert NORMAL_WARC2ZIM_EXIT_CODE
		`@ -0,0 +1 @@`
							`These are integration tests, meant to be run inside the CI (because we first need to perform a zimit run on a given website and then check its output)`