Merge branch 'zimit2'

This commit is contained in:
benoit74 2024-05-24 14:07:05 +00:00
commit ce49a5d4e9
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
18 changed files with 753 additions and 188 deletions

View File

@ -1,2 +0,0 @@
output/
node_modules/

View File

@ -1,26 +1,20 @@
name: Docker name: Publish released version
on: on:
push: release:
branches: types: [published]
- main
tags:
- v*
jobs: jobs:
build-and-push: publish:
name: Deploy Docker Image
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
steps: steps:
- name: Retrieve source code - uses: actions/checkout@v3
uses: actions/checkout@v3
- name: Build and push - name: Build and push Docker image
uses: openzim/docker-publish-action@v10 uses: openzim/docker-publish-action@v10
with: with:
image-name: openzim/zimit image-name: openzim/zimit
on-master: dev
tag-pattern: /^v([0-9.]+)$/ tag-pattern: /^v([0-9.]+)$/
latest-on-tag: true latest-on-tag: true
restrict-to: openzim/zimit restrict-to: openzim/zimit

View File

@ -0,0 +1,30 @@
# Workflow: on every push to the `main` branch, build the Docker image and
# push it to ghcr.io as `openzim/zimit` under the fixed `dev` tag.
name: Publish Docker dev image
on:
push:
branches:
- main
jobs:
publish:
runs-on: ubuntu-22.04
steps:
# Check out the repository so the image is built from the pushed commit.
- uses: actions/checkout@v3
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
# Publish under a fixed tag instead of a tag derived from a git ref.
manual-tag: dev
# NOTE(review): presumably leaves the `latest` tag untouched for dev builds; confirm against the action's documentation.
latest-on-tag: false
# NOTE(review): presumably prevents publishing from forks of openzim/zimit; confirm.
restrict-to: openzim/zimit
registries: ghcr.io
# Registry credentials are read from repository secrets.
credentials:
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
# Target platforms, one per line.
platforms: |
linux/amd64
linux/arm64

View File

@ -0,0 +1,30 @@
# Workflow: on every push to the `zimit2` branch, build the Docker image and
# push it to ghcr.io as `openzim/zimit` under the fixed `zimit2` tag.
# NOTE(review): the changelog flags the `zimit2` image as temporary and to be
# removed before releasing.
name: Publish Docker zimit2 image
on:
push:
branches:
- zimit2
jobs:
publish:
runs-on: ubuntu-22.04
steps:
# Check out the repository so the image is built from the pushed commit.
- uses: actions/checkout@v3
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
# Publish under a fixed tag instead of a tag derived from a git ref.
manual-tag: zimit2
# NOTE(review): presumably leaves the `latest` tag untouched for these builds; confirm against the action's documentation.
latest-on-tag: false
# NOTE(review): presumably prevents publishing from forks of openzim/zimit; confirm.
restrict-to: openzim/zimit
registries: ghcr.io
# Registry credentials are read from repository secrets.
credentials:
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
# Target platforms, one per line.
platforms: |
linux/amd64
linux/arm64

34
.github/workflows/QA.yaml vendored Normal file
View File

@ -0,0 +1,34 @@
# Workflow: run formatting, lint and static type checks on every pull request
# and on every push to `main`.
name: QA
on:
pull_request:
push:
branches:
- main
jobs:
check-qa:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
# The Python version is taken from pyproject.toml rather than hard-coded here.
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version-file: pyproject.toml
architecture: x64
# Editable install with the optional dependency groups declared in pyproject.toml.
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[lint,scripts,test,check]
# Each check below is an invoke (`inv`) task.
- name: Check black formatting
run: inv lint-black
- name: Check ruff
run: inv lint-ruff
- name: Check pyright
run: inv check-pyright

66
.github/workflows/Tests.yaml vendored Normal file
View File

@ -0,0 +1,66 @@
# Workflow: on every pull request and on every push to `main`, run three jobs:
# unit tests with coverage, a Python packaging check, and Docker-based
# integration tests.
name: Tests
on:
pull_request:
push:
branches:
- main
jobs:
# Unit tests with coverage, uploaded to codecov.
run-tests:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
# The Python version is taken from pyproject.toml rather than hard-coded here.
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[test,scripts]
# `coverage` is an invoke task; `--args` is forwarded to pytest.
- name: Run the tests
run: inv coverage --args "-vvv"
- name: Upload coverage report to codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
# Checks that both the sdist and the wheel can be built.
build_python:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version-file: pyproject.toml
architecture: x64
- name: Ensure we can build Python targets
run: |
pip install -U pip build
python3 -m build --sdist --wheel
# this job replaces the standard "build_docker" job since it builds the docker image
run-integration-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v3
- name: build image
run: docker build -t zimit .
# Crawl a test site inside the freshly built image; results go to the
# host `output` directory mounted at /output, and `--keep` preserves the
# temporary crawl directory.
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
# Mount the integration test file into the image and run it with pytest
# from the image's /app/zimit virtual environment, against the crawl output.
- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"

View File

@ -1,20 +0,0 @@
name: CI
on: push
jobs:
integration-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v3
- name: build image
run: docker build -t zimit .
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"

27
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,27 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
# The black, ruff and pyright revisions below match the versions pinned in the
# `lint` and `check` extras of pyproject.toml; keep them in sync when upgrading.
repos:
# Generic whitespace / end-of-file hygiene hooks.
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
# Code formatting.
- repo: https://github.com/psf/black
rev: "24.2.0"
hooks:
- id: black
# Linting.
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.0
hooks:
- id: ruff
# Static type checking. The hook is overridden to run the `pyright`
# executable from the local environment (`language: system`).
# NOTE(review): presumably so pyright sees the project's installed dependencies; confirm.
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.352
hooks:
- id: pyright
name: pyright (system)
description: 'pyright static type checker'
entry: pyright
language: system
'types_or': [python, pyi]
require_serial: true
minimum_pre_commit_version: '2.9.2'

View File

@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
### Added
- New `--version` flag to display Zimit version
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
- New `--noMobileDevice` CLI argument
### Changed
- Use `warc2zim` version 2, which no longer requires a Service Worker
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
- Adopt Python bootstrap conventions
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
- Upgrade to Python 3.12 + upgrade dependencies
- `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values
- `--userAgent` CLI argument is not mandatory anymore
- Upgraded Browsertrix Crawler to 1.0.3
### Fixed
- Fix support for Youtube videos (#291)
## [1.6.3] - 2024-01-18 ## [1.6.3] - 2024-01-18

View File

@ -1,36 +1,48 @@
FROM webrecorder/browsertrix-crawler:0.12.4 FROM webrecorder/browsertrix-crawler:1.1.1
LABEL org.opencontainers.image.source https://github.com/openzim/zimit LABEL org.opencontainers.image.source https://github.com/openzim/zimit
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
RUN add-apt-repository ppa:deadsnakes/ppa -y
RUN apt-get update \ RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \ && apt-get install -qqy --no-install-recommends \
libmagic1 \ libmagic1 \
python3.10-venv \ python3.12-venv \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix) # python setup (in venv not to conflict with browsertrix)
&& python3 -m venv /app/zimit \ && python3.12 -m venv /app/zimit \
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \
'warc2zim==1.5.5' \
# placeholder (default output location) # placeholder (default output location)
&& mkdir -p /output \ && mkdir -p /output \
# disable chrome upgrade # disable chrome upgrade
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \ && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# download list of bad domains to filter-out. intentionnaly ran post-install \ # download list of bad domains to filter-out. intentionnaly ran post-install \
# so it's not cached in earlier layers (url stays same but content updated) \ # so it's not cached in earlier layers (url stays same but content updated) \
mkdir -p /tmp/ads && cd /tmp/ads && \ && mkdir -p /tmp/ads \
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \ && cd /tmp/ads \
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \ && curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \ && curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
cat ./*.txt > /etc/blocklist.txt \ && curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
&& cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt \ && rm ./*.txt \
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \ && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
chmod +x /usr/local/bin/entrypoint.sh && chmod +x /usr/local/bin/entrypoint.sh
WORKDIR /app # Copy pyproject.toml and its dependencies
ADD zimit.py /app/ COPY pyproject.toml README.md /src/
# fix shebang on zimit to use in-venv python COPY src/zimit/__about__.py /src/src/zimit/__about__.py
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
&& ln -s /app/zimit.py /usr/bin/zimit \ # Install Python dependencies
&& chmod +x /usr/bin/zimit RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
# Copy code + associated artifacts
COPY src /src/src
COPY *.md /src/
# Install + cleanup
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
&& chmod +x /usr/bin/zimit \
&& rm -rf /src
ENTRYPOINT ["entrypoint.sh"] ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit"] CMD ["zimit", "--help"]

View File

@ -3,12 +3,9 @@ Zimit
Zimit is a scraper allowing to create ZIM file from any Web site. Zimit is a scraper allowing to create ZIM file from any Web site.
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
[![Build](https://github.com/openzim/zimit/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/zimit/actions?query=branch%3Amain)
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS.
Technical background Technical background
-------------------- --------------------
@ -68,7 +65,10 @@ default and prints the crawl status to the Docker log.
Nota bene Nota bene
--------- ---------
A first version of a generic HTTP scraper was created in 2016 during While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
since Zimit 2.x, which does not have any special requirements.
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
the [Wikimania Esino Lario the [Wikimania Esino Lario
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon). Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).

224
pyproject.toml Normal file
View File

@ -0,0 +1,224 @@
[build-system]
requires = ["hatchling", "hatch-openzim==0.2.0"]
build-backend = "hatchling.build"
[project]
name = "zimit"
requires-python = ">=3.12,<3.13"
description = "Make ZIM file from any website through crawling"
readme = "README.md"
dependencies = [
"requests==2.31.0",
"inotify==0.2.10",
"tld==0.13",
"warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
[tool.hatch.metadata.hooks.openzim-metadata]
kind = "scraper"
[tool.hatch.metadata]
allow-direct-references = true # to be removed once we use a released warc2zim version
[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
]
lint = [
"black==24.2.0",
"ruff==0.3.0",
]
check = [
"pyright==1.1.352",
]
test = [
"pytest==8.0.2",
"coverage==7.4.3",
]
dev = [
"pre-commit==3.6.2",
"debugpy==1.8.1",
"zimit[scripts]",
"zimit[lint]",
"zimit[test]",
"zimit[check]",
]
[project.scripts]
zimit = "zimit:zimit.zimit"
[tool.hatch.version]
path = "src/zimit/__about__.py"
[tool.hatch.build]
exclude = [
"/.github",
]
[tool.hatch.build.targets.wheel]
packages = ["src/zimit"]
[tool.hatch.envs.default]
features = ["dev"]
[tool.hatch.envs.test]
features = ["scripts", "test"]
[tool.hatch.envs.test.scripts]
run = "inv test --args '{args}'"
run-cov = "inv test-cov --args '{args}'"
report-cov = "inv report-cov"
coverage = "inv coverage --args '{args}'"
html = "inv coverage --html --args '{args}'"
[tool.hatch.envs.lint]
template = "lint"
skip-install = false
features = ["scripts", "lint"]
[tool.hatch.envs.lint.scripts]
black = "inv lint-black --args '{args}'"
ruff = "inv lint-ruff --args '{args}'"
all = "inv lintall --args '{args}'"
fix-black = "inv fix-black --args '{args}'"
fix-ruff = "inv fix-ruff --args '{args}'"
fixall = "inv fixall --args '{args}'"
[tool.hatch.envs.check]
features = ["scripts", "check"]
[tool.hatch.envs.check.scripts]
pyright = "inv check-pyright --args '{args}'"
all = "inv checkall --args '{args}'"
[tool.black]
line-length = 88
target-version = ['py312']
[tool.ruff]
target-version = "py312"
line-length = 88
src = ["src"]
[tool.ruff.lint]
select = [
"A", # flake8-builtins
# "ANN", # flake8-annotations
"ARG", # flake8-unused-arguments
# "ASYNC", # flake8-async
"B", # flake8-bugbear
# "BLE", # flake8-blind-except
"C4", # flake8-comprehensions
"C90", # mccabe
# "COM", # flake8-commas
# "D", # pydocstyle
# "DJ", # flake8-django
"DTZ", # flake8-datetimez
"E", # pycodestyle (default)
"EM", # flake8-errmsg
# "ERA", # eradicate
# "EXE", # flake8-executable
"F", # Pyflakes (default)
# "FA", # flake8-future-annotations
"FBT", # flake8-boolean-trap
# "FLY", # flynt
# "G", # flake8-logging-format
"I", # isort
"ICN", # flake8-import-conventions
# "INP", # flake8-no-pep420
# "INT", # flake8-gettext
"ISC", # flake8-implicit-str-concat
"N", # pep8-naming
# "NPY", # NumPy-specific rules
# "PD", # pandas-vet
# "PGH", # pygrep-hooks
# "PIE", # flake8-pie
# "PL", # Pylint
"PLC", # Pylint: Convention
"PLE", # Pylint: Error
"PLR", # Pylint: Refactor
"PLW", # Pylint: Warning
# "PT", # flake8-pytest-style
# "PTH", # flake8-use-pathlib
# "PYI", # flake8-pyi
"Q", # flake8-quotes
# "RET", # flake8-return
# "RSE", # flake8-raise
"RUF", # Ruff-specific rules
"S", # flake8-bandit
# "SIM", # flake8-simplify
# "SLF", # flake8-self
"T10", # flake8-debugger
"T20", # flake8-print
# "TCH", # flake8-type-checking
# "TD", # flake8-todos
"TID", # flake8-tidy-imports
# "TRY", # tryceratops
"UP", # pyupgrade
"W", # pycodestyle
"YTT", # flake8-2020
]
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
"EM",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Ignore checks for possible passwords
"S105", "S106", "S107",
# Ignore warnings on subprocess.run / popen
"S603",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
]
unfixable = [
# Don't touch unused imports
"F401",
]
[tool.ruff.lint.isort]
known-first-party = ["zimit"]
[tool.ruff.lint.flake8-bugbear]
# add exceptions to B008 for fastapi.
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "all"
[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
[tool.pytest.ini_options]
minversion = "7.3"
testpaths = ["tests"]
pythonpath = [".", "src"]
[tool.coverage.paths]
zimit = ["src/zimit"]
tests = ["tests"]
[tool.coverage.run]
source_pkgs = ["zimit"]
branch = true
parallel = true
omit = [
"src/zimit/__about__.py",
]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.pyright]
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.12"
typeCheckingMode="basic"

1
src/zimit/__about__.py Normal file
View File

@ -0,0 +1 @@
__version__ = "2.0.0-dev4"

View File

@ -1,7 +1,3 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
""" """
Main zimit run script Main zimit run script
This script validates arguments with warc2zim, checks permissions This script validates arguments with warc2zim, checks permissions
@ -11,6 +7,7 @@ and then calls the Node based driver
import atexit import atexit
import itertools import itertools
import json import json
import logging
import shutil import shutil
import signal import signal
import subprocess import subprocess
@ -23,19 +20,24 @@ from pathlib import Path
import inotify import inotify
import inotify.adapters import inotify.adapters
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri from zimscraperlib.uri import rebuild_uri
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15" from zimit.__about__ import __version__
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
logger = getLogger(name="zimit", level=logging.INFO)
class ProgressFileWatcher: class ProgressFileWatcher:
def __init__(self, output_dir, stats_path): def __init__(self, output_dir: Path, stats_path: Path):
self.crawl_path = output_dir / "crawl.json" self.crawl_path = output_dir / "crawl.json"
self.warc2zim_path = output_dir / "warc2zim.json" self.warc2zim_path = output_dir / "warc2zim.json"
self.stats_path = Path(stats_path) self.stats_path = stats_path
if not self.stats_path.is_absolute(): if not self.stats_path.is_absolute():
self.stats_path = output_dir / self.stats_path self.stats_path = output_dir / self.stats_path
@ -46,6 +48,8 @@ class ProgressFileWatcher:
self.process = None self.process = None
def stop(self): def stop(self):
if not self.process:
return
self.process.join(0.1) self.process.join(0.1)
self.process.terminate() self.process.terminate()
@ -58,10 +62,10 @@ class ProgressFileWatcher:
self.process.start() self.process.start()
@staticmethod @staticmethod
def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath): def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
ino = inotify.adapters.Inotify() ino = inotify.adapters.Inotify()
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
class Limit: class Limit:
def __init__(self): def __init__(self):
@ -97,15 +101,15 @@ class ProgressFileWatcher:
"limit": limit.as_dict, "limit": limit.as_dict,
} }
for _, _, fpath, _ in ino.event_gen(yield_nones=False): for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore
func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath) func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
if not func: if not func:
continue continue
# open input and output separatly as to not clear output on error # open input and output separatly as to not clear output on error
with open(fpath, "r") as ifh: with open(fpath) as ifh:
try: try:
out = func(json.load(ifh), limit) out = func(json.load(ifh), limit)
except Exception: # nosec except Exception: # nosec # noqa: S112
# simply ignore progress update should an error arise # simply ignore progress update should an error arise
# might be malformed input for instance # might be malformed input for instance
continue continue
@ -115,7 +119,7 @@ class ProgressFileWatcher:
json.dump(out, ofh) json.dump(out, ofh)
def zimit(args=None): def run(raw_args):
wait_until_options = ["load", "domcontentloaded", "networkidle"] wait_until_options = ["load", "domcontentloaded", "networkidle"]
wait_until_all = wait_until_options + [ wait_until_all = wait_until_options + [
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2) f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
@ -131,7 +135,7 @@ def zimit(args=None):
parser.add_argument( parser.add_argument(
"--urlFile", "--urlFile",
help="If set, read a list of seed urls, " "one per line, from the specified", help="If set, read a list of seed urls, one per line, from the specified",
) )
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers") parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
@ -205,7 +209,8 @@ def zimit(args=None):
parser.add_argument( parser.add_argument(
"--lang", "--lang",
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code", help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
) )
parser.add_argument( parser.add_argument(
@ -220,12 +225,21 @@ def zimit(args=None):
help="Emulate mobile device by name from " help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob/" "https://github.com/puppeteer/puppeteer/blob/"
"main/packages/puppeteer-core/src/common/Device.ts", "main/packages/puppeteer-core/src/common/Device.ts",
default="Pixel 2",
)
parser.add_argument(
"--noMobileDevice",
help="Do not emulate a mobile device (use at your own risk, behavior is"
"uncertain)",
action="store_true",
default=False,
) )
parser.add_argument( parser.add_argument(
"--userAgent", "--userAgent",
help="Override default user-agent with specified value ; --userAgentSuffix is still applied", help="Override default user-agent with specified value ; --userAgentSuffix and "
default=DEFAULT_USER_AGENT, "--adminEmail have no effect when this is set",
) )
parser.add_argument( parser.add_argument(
@ -333,7 +347,38 @@ def zimit(args=None):
"to configure the crawling behaviour if not set via argument.", "to configure the crawling behaviour if not set via argument.",
) )
zimit_args, warc2zim_args = parser.parse_known_args(args) parser.add_argument(
"--version",
help="Display scraper version and exit",
action="version",
version=f"Zimit {__version__}",
)
parser.add_argument(
"--logging",
help="Crawler logging configuration",
)
zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
logger.info("Checking browsertrix-crawler version")
crawl_version_cmd = ["crawl", "--version"]
try:
crawl = subprocess.run(
crawl_version_cmd, check=True, capture_output=True, text=True
)
except Exception:
logger.error("Failed to get Browsertrix crawler version")
raise
crawler_version = crawl.stdout.strip()
logger.info(f"Browsertrix crawler: version {crawler_version}")
# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
# versions are associated with the ZIM
warc2zim_args.append("--scraper-suffix")
warc2zim_args.append(
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
)
# pass url and output to warc2zim also # pass url and output to warc2zim also
if zimit_args.output: if zimit_args.output:
@ -342,14 +387,12 @@ def zimit(args=None):
url = zimit_args.url url = zimit_args.url
user_agent = zimit_args.userAgent user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.userAgentSuffix:
user_agent += f" {zimit_args.userAgentSuffix}"
if zimit_args.adminEmail: if zimit_args.adminEmail:
user_agent += f" {zimit_args.adminEmail}" user_agent_suffix += f" {zimit_args.adminEmail}"
if url: if url:
url = check_url(url, user_agent, zimit_args.scopeType) url = get_cleaned_url(url)
warc2zim_args.append("--url") warc2zim_args.append("--url")
warc2zim_args.append(url) warc2zim_args.append(url)
@ -372,13 +415,13 @@ def zimit(args=None):
warc2zim_args.append("--lang") warc2zim_args.append("--lang")
warc2zim_args.append(zimit_args.zim_lang) warc2zim_args.append(zimit_args.zim_lang)
print("----------") logger.info("----------")
print("Testing warc2zim args") logger.info("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True) logger.info("Running: warc2zim " + " ".join(warc2zim_args))
res = warc2zim(warc2zim_args) res = warc2zim(warc2zim_args)
if res != 100: if res != NORMAL_WARC2ZIM_EXIT_CODE:
print("Exiting, invalid warc2zim params") logger.info("Exiting, invalid warc2zim params")
return 2 return EXIT_CODE_WARC2ZIM_CHECK_FAILED
# make temp dir for this crawl # make temp dir for this crawl
if zimit_args.build: if zimit_args.build:
@ -389,9 +432,9 @@ def zimit(args=None):
if not zimit_args.keep: if not zimit_args.keep:
def cleanup(): def cleanup():
print("") logger.info("")
print("----------") logger.info("----------")
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True) logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
shutil.rmtree(temp_root_dir) shutil.rmtree(temp_root_dir)
atexit.register(cleanup) atexit.register(cleanup)
@ -401,8 +444,12 @@ def zimit(args=None):
cmd_args.append("--url") cmd_args.append("--url")
cmd_args.append(url) cmd_args.append(url)
cmd_args.append("--userAgent") cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent) cmd_args.append(user_agent_suffix)
if not zimit_args.noMobileDevice:
cmd_args.append("--mobileDevice")
cmd_args.append(zimit_args.mobileDevice)
cmd_args.append("--cwd") cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir)) cmd_args.append(str(temp_root_dir))
@ -412,7 +459,7 @@ def zimit(args=None):
watcher = ProgressFileWatcher( watcher = ProgressFileWatcher(
Path(zimit_args.output), Path(zimit_args.statsFilename) Path(zimit_args.output), Path(zimit_args.statsFilename)
) )
print(f"Writing progress to {watcher.stats_path}") logger.info(f"Writing progress to {watcher.stats_path}")
# update crawler command # update crawler command
cmd_args.append("--statsFilename") cmd_args.append("--statsFilename")
cmd_args.append(str(watcher.crawl_path)) cmd_args.append(str(watcher.crawl_path))
@ -424,15 +471,16 @@ def zimit(args=None):
cmd_line = " ".join(cmd_args) cmd_line = " ".join(cmd_args)
print("") logger.info("")
print("----------") logger.info("----------")
print( logger.info(
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}" f"Output to tempdir: {temp_root_dir} - "
f"{'will keep' if zimit_args.keep else 'will delete'}"
) )
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(cmd_args) crawl = subprocess.run(cmd_args, check=False)
if crawl.returncode == 11: if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
print("crawl interupted by a limit") logger.info("crawl interupted by a limit")
elif crawl.returncode != 0: elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args) raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
@ -447,65 +495,33 @@ def zimit(args=None):
"Failed to find directory where WARC files have been created" "Failed to find directory where WARC files have been created"
) )
elif len(warc_dirs) > 1: elif len(warc_dirs) > 1:
print("Found many WARC files directories, only last one will be used") logger.info("Found many WARC files directories, only last one will be used")
for directory in warc_dirs: for directory in warc_dirs:
print(f"- {directory}") logger.info(f"- {directory}")
warc_directory = warc_dirs[-1] warc_directory = warc_dirs[-1]
print("") logger.info("")
print("----------") logger.info("----------")
print(f"Processing WARC files in {warc_directory}") logger.info(f"Processing WARC files in {warc_directory}")
warc2zim_args.append(str(warc_directory)) warc2zim_args.append(str(warc_directory))
num_files = sum(1 for _ in warc_directory.iterdir()) num_files = sum(1 for _ in warc_directory.iterdir())
print(f"{num_files} WARC files found", flush=True) logger.info(f"{num_files} WARC files found")
print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True) logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
return warc2zim(warc2zim_args) return warc2zim(warc2zim_args)
def check_url(url, user_agent, scope=None): def get_cleaned_url(url: str):
url = urllib.parse.urlparse(url) parsed_url = urllib.parse.urlparse(url)
try:
with requests.get(
url.geturl(),
stream=True,
allow_redirects=True,
timeout=(12.2, 27),
headers={"User-Agent": user_agent},
) as resp:
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
raise SystemExit(1)
actual_url = urllib.parse.urlparse(resp.url)
# remove explicit port in URI for default-for-scheme as browsers does it # remove explicit port in URI for default-for-scheme as browsers does it
if actual_url.scheme == "https" and actual_url.port == 443: if parsed_url.scheme == "https" and parsed_url.port == 443: # noqa: PLR2004
actual_url = rebuild_uri(actual_url, port="") parsed_url = rebuild_uri(parsed_url, port="")
if actual_url.scheme == "http" and actual_url.port == 80: if parsed_url.scheme == "http" and parsed_url.port == 80: # noqa: PLR2004
actual_url = rebuild_uri(actual_url, port="") parsed_url = rebuild_uri(parsed_url, port="")
if actual_url.geturl() != url.geturl(): return parsed_url.geturl()
if scope in (None, "any"):
return actual_url.geturl()
print(
"[WARN] Your URL ({0}) redirects to {1} which {2} on same "
"first-level domain. Depending on your scopeType ({3}), "
"your homepage might be out-of-scope. Please check!".format(
url.geturl(),
actual_url.geturl(),
"is"
if get_fld(url.geturl()) == get_fld(actual_url.geturl())
else "is not",
scope,
)
)
return actual_url.geturl()
return url.geturl()
def get_node_cmd_line(args): def get_node_cmd_line(args):
@ -527,7 +543,7 @@ def get_node_cmd_line(args):
"collection", "collection",
"allowHashUrls", "allowHashUrls",
"lang", "lang",
"mobileDevice", "userAgent",
"useSitemap", "useSitemap",
"behaviors", "behaviors",
"behaviorTimeout", "behaviorTimeout",
@ -539,9 +555,10 @@ def get_node_cmd_line(args):
"healthCheckPort", "healthCheckPort",
"overwrite", "overwrite",
"config", "config",
"logging",
]: ]:
value = getattr(args, arg) value = getattr(args, arg)
if value == None or (isinstance(value, bool) and value == False): if value is None or (isinstance(value, bool) and value is False):
continue continue
node_cmd.append("--" + arg) node_cmd.append("--" + arg)
if not isinstance(value, bool): if not isinstance(value, bool):
@ -550,17 +567,22 @@ def get_node_cmd_line(args):
return node_cmd return node_cmd
def sigint_handler(*args): def sigint_handler(*args): # noqa: ARG001
print("") logger.info("")
print("") logger.info("")
print("SIGINT/SIGTERM received, stopping zimit") logger.info("SIGINT/SIGTERM received, stopping zimit")
print("") logger.info("")
print("", flush=True) logger.info("")
sys.exit(3) sys.exit(3)
def zimit():
run(sys.argv[1:])
signal.signal(signal.SIGINT, sigint_handler) signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler) signal.signal(signal.SIGTERM, sigint_handler)
if __name__ == "__main__": if __name__ == "__main__":
zimit() zimit()

109
tasks.py Normal file
View File

@ -0,0 +1,109 @@
# pyright: strict, reportUntypedFunctionDecorator=false
import os
from invoke.context import Context
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
# use_pty is True when the CI environment variable is unset or empty; it is
# passed as the `pty` argument of the ctx.run calls in the tasks below
use_pty = not os.getenv("CI", "")
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """run the test suite with pytest, no coverage collected"""
    # `args` is appended verbatim to the pytest command line
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run tests with coverage"""
    # pytest is started through `coverage run`; `args` is appended verbatim to
    # the pytest command line
    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """combine coverage data and print the report, optionally exporting HTML"""
    # warn=True: a failing `coverage combine` does not abort the task
    ctx.run("coverage combine", warn=True, pty=use_pty)
    ctx.run("coverage report --show-missing", pty=use_pty)
    if not html:
        return
    # HTML export happens only when explicitly requested
    ctx.run("coverage html", pty=use_pty)
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """run the tests under coverage, then report the results"""
    # step 1: collect coverage data while running pytest
    test_cov(ctx, args=args)
    # step 2: print the report (and export it to HTML when `html` is set)
    report_cov(ctx, html=html)
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """check black formatting"""
    args = args or "."  # needed for hatch script
    # print the black version first so it shows up in the output
    ctx.run("black --version", pty=use_pty)
    # --check/--diff: report what would change without rewriting any file
    ctx.run(f"black --check --diff {args}", pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """check all ruff rules"""
    args = args or "."  # needed for hatch script
    # print the ruff version first so it shows up in the output
    ctx.run("ruff --version", pty=use_pty)
    # plain `ruff check` (no --fix): issues are reported, not fixed
    ctx.run(f"ruff check {args}", pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Run every lint check: black first, then ruff"""
    # an empty value falls back to "." (needed for hatch script)
    target = args or "."
    lint_black(ctx, target)
    lint_ruff(ctx, target)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright"""
    # print the pyright version first; pty=use_pty is passed here as well, to
    # match every other ctx.run call in this file
    ctx.run("pyright --version", pty=use_pty)
    # `args` is appended verbatim to the pyright command line
    ctx.run(f"pyright {args}", pty=use_pty)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """run all static checks (currently only pyright)"""
    # single entry point for the check tasks; forwards `args` unchanged
    check_pyright(ctx, args)
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """reformat code with black"""
    # an empty value falls back to "." (needed for hatch script)
    target = args or "."
    ctx.run(f"black {target}", pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """apply ruff automatic fixes"""
    # an empty value falls back to "." (needed for hatch script)
    target = args or "."
    ctx.run(f"ruff check --fix {target}", pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Apply all automatic fixes, then run the lint checks"""
    # an empty value falls back to "." (needed for hatch script)
    target = args or "."
    # fixers first (black, then ruff) ...
    fix_black(ctx, target)
    fix_ruff(ctx, target)
    # ... then the lint checks run on the result
    lintall(ctx, target)

View File

@ -0,0 +1 @@
These are integration tests, meant to be run inside the CI (because we first need to perform a zimit run on a given website and then check its output).

View File

@ -1,14 +1,9 @@
import os
import glob import glob
import json import json
import os
import libzim.reader
from warcio import ArchiveIterator from warcio import ArchiveIterator
from zimscraperlib.zim import Archive
def get_zim_article(zimfile, path):
zim_fh = libzim.reader.Archive(zimfile)
return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
def test_is_file(): def test_is_file():
@ -20,20 +15,34 @@ def test_zim_main_page():
"""Main page specified, http://isago.rskg.org/, was a redirect to https """Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page""" Ensure main page is the redirected page"""
assert b'"https://isago.rskg.org/"' in get_zim_article( main_entry = Archive("/output/isago.zim").main_entry
"/output/isago.zim", "A/index.html" assert main_entry.is_redirect
) assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
def test_zim_scraper():
    """Check the Scraper metadata of the ZIM

    It must mention zimit, warc2zim and Browsertrix crawler"""

    zim_fh = Archive("/output/isago.zim")
    scraper = zim_fh.get_text_metadata("Scraper")
    # each name is matched with a trailing space, i.e. it must be followed by
    # more text in the metadata value
    assert "zimit " in scraper
    assert "warc2zim " in scraper
    assert "Browsertrix crawler " in scraper
def test_user_agent(): def test_user_agent():
"""Test that mobile user agent was used in WARC request records with custom Zimit and email suffix""" """Test that mobile user agent was used
Check is done in WARC request records with custom Zimit and email suffix
"""
found = False found = False
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"): for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
with open(warc, "rb") as fh: with open(warc, "rb") as fh:
for record in ArchiveIterator(fh): for record in ArchiveIterator(fh):
if record.rec_type == "request": if record.rec_type == "request":
print(record.http_headers) print(record.http_headers) # noqa: T201
ua = record.http_headers.get_header("User-Agent") ua = record.http_headers.get_header("User-Agent")
if ua: if ua:
assert "Mozilla" in ua assert "Mozilla" in ua
@ -56,12 +65,12 @@ def test_stats_output():
} }
with open("/output/warc2zim.json") as fh: with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == { assert json.loads(fh.read()) == {
"written": 8, "written": 7,
"total": 8, "total": 7,
} }
with open("/output/stats.json") as fh: with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == { assert json.loads(fh.read()) == {
"done": 8, "done": 7,
"total": 8, "total": 7,
"limit": {"max": 0, "hit": False}, "limit": {"max": 0, "hit": False},
} }

6
tests/test_dummy.py Normal file
View File

@ -0,0 +1,6 @@
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
# dummy test, just to have coverage report done
def test_something_exists():
    """Check that the imported NORMAL_WARC2ZIM_EXIT_CODE constant is truthy."""
    assert NORMAL_WARC2ZIM_EXIT_CODE