Merge branch 'zimit2'

This commit is contained in:
benoit74 2024-05-24 14:07:05 +00:00
commit ce49a5d4e9
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
18 changed files with 753 additions and 188 deletions

View File

@ -1,2 +0,0 @@
output/
node_modules/

View File

@ -1,26 +1,20 @@
name: Docker
name: Publish released version
on:
push:
branches:
- main
tags:
- v*
release:
types: [published]
jobs:
build-and-push:
name: Deploy Docker Image
publish:
runs-on: ubuntu-22.04
steps:
- name: Retrieve source code
uses: actions/checkout@v3
- uses: actions/checkout@v3
- name: Build and push
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
on-master: dev
tag-pattern: /^v([0-9.]+)$/
latest-on-tag: true
restrict-to: openzim/zimit

View File

@ -0,0 +1,30 @@
name: Publish Docker dev image
on:
push:
branches:
- main
jobs:
publish:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
manual-tag: dev
latest-on-tag: false
restrict-to: openzim/zimit
registries: ghcr.io
credentials:
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
platforms: |
linux/amd64
linux/arm64

View File

@ -0,0 +1,30 @@
name: Publish Docker zimit2 image
on:
push:
branches:
- zimit2
jobs:
publish:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
manual-tag: zimit2
latest-on-tag: false
restrict-to: openzim/zimit
registries: ghcr.io
credentials:
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
platforms: |
linux/amd64
linux/arm64

34
.github/workflows/QA.yaml vendored Normal file
View File

@ -0,0 +1,34 @@
name: QA
on:
pull_request:
push:
branches:
- main
jobs:
check-qa:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[lint,scripts,test,check]
- name: Check black formatting
run: inv lint-black
- name: Check ruff
run: inv lint-ruff
- name: Check pyright
run: inv check-pyright

66
.github/workflows/Tests.yaml vendored Normal file
View File

@ -0,0 +1,66 @@
name: Tests
on:
pull_request:
push:
branches:
- main
jobs:
run-tests:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[test,scripts]
- name: Run the tests
run: inv coverage --args "-vvv"
- name: Upload coverage report to codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
build_python:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version-file: pyproject.toml
architecture: x64
- name: Ensure we can build Python targets
run: |
pip install -U pip build
python3 -m build --sdist --wheel
# this job replaces the standard "build_docker" job since it builds the docker image
run-integration-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v3
- name: build image
run: docker build -t zimit .
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"

View File

@ -1,20 +0,0 @@
name: CI
on: push
jobs:
integration-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v3
- name: build image
run: docker build -t zimit .
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"

27
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,27 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
rev: "24.2.0"
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.0
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.352
hooks:
- id: pyright
name: pyright (system)
description: 'pyright static type checker'
entry: pyright
language: system
'types_or': [python, pyi]
require_serial: true
minimum_pre_commit_version: '2.9.2'

View File

@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- New `--version` flag to display Zimit version
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
- New `--noMobileDevice` CLI argument
### Changed
- Use `warc2zim` version 2, which no longer requires a Service Worker
- Using the `warc2zim2` branch of warc2zim ⚠️ change before releasing!
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
- Adopt Python bootstrap conventions
- Removed handling of redirects by zimit; they are now handled by Browsertrix Crawler and properly detected by warc2zim
- Upgrade to Python 3.12 + upgrade dependencies
- `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values
- `--userAgent` CLI argument is no longer mandatory
- Upgraded Browsertrix Crawler to 1.0.3
### Fixed
- Fix support for Youtube videos (#291)
## [1.6.3] - 2024-01-18

View File

@ -1,36 +1,48 @@
FROM webrecorder/browsertrix-crawler:0.12.4
FROM webrecorder/browsertrix-crawler:1.1.1
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
RUN add-apt-repository ppa:deadsnakes/ppa -y
RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
libmagic1 \
python3.10-venv \
python3.12-venv \
&& rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix)
&& python3 -m venv /app/zimit \
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \
'warc2zim==1.5.5' \
&& python3.12 -m venv /app/zimit \
# placeholder (default output location)
&& mkdir -p /output \
# disable chrome upgrade
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# download list of bad domains to filter-out. intentionnaly ran post-install \
# so it's not cached in earlier layers (url stays same but content updated) \
mkdir -p /tmp/ads && cd /tmp/ads && \
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
cat ./*.txt > /etc/blocklist.txt \
&& mkdir -p /tmp/ads \
&& cd /tmp/ads \
&& curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
&& curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
&& curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
&& cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt \
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
chmod +x /usr/local/bin/entrypoint.sh
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
&& chmod +x /usr/local/bin/entrypoint.sh
WORKDIR /app
ADD zimit.py /app/
# fix shebang on zimit to use in-venv python
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
&& ln -s /app/zimit.py /usr/bin/zimit \
&& chmod +x /usr/bin/zimit
# Copy pyproject.toml and its dependencies
COPY pyproject.toml README.md /src/
COPY src/zimit/__about__.py /src/src/zimit/__about__.py
# Install Python dependencies
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
# Copy code + associated artifacts
COPY src /src/src
COPY *.md /src/
# Install + cleanup
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
&& chmod +x /usr/bin/zimit \
&& rm -rf /src
ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit"]
CMD ["zimit", "--help"]

View File

@ -3,12 +3,9 @@ Zimit
Zimit is a scraper that creates a ZIM file from any website.
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
[![Build](https://github.com/openzim/zimit/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/zimit/actions?query=branch%3Amain)
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS.
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
Technical background
--------------------
@ -68,7 +65,10 @@ default and prints the crawl status to the Docker log.
Nota bene
---------
A first version of a generic HTTP scraper was created in 2016 during
While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
as of Zimit 2.x, which has no special requirements.
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
the [Wikimania Esino Lario
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).

224
pyproject.toml Normal file
View File

@ -0,0 +1,224 @@
[build-system]
requires = ["hatchling", "hatch-openzim==0.2.0"]
build-backend = "hatchling.build"
[project]
name = "zimit"
requires-python = ">=3.12,<3.13"
description = "Make ZIM file from any website through crawling"
readme = "README.md"
dependencies = [
"requests==2.31.0",
"inotify==0.2.10",
"tld==0.13",
"warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
[tool.hatch.metadata.hooks.openzim-metadata]
kind = "scraper"
[tool.hatch.metadata]
allow-direct-references = true # to be removed once we use a released warc2zim version
[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
]
lint = [
"black==24.2.0",
"ruff==0.3.0",
]
check = [
"pyright==1.1.352",
]
test = [
"pytest==8.0.2",
"coverage==7.4.3",
]
dev = [
"pre-commit==3.6.2",
"debugpy==1.8.1",
"zimit[scripts]",
"zimit[lint]",
"zimit[test]",
"zimit[check]",
]
[project.scripts]
zimit = "zimit:zimit.zimit"
[tool.hatch.version]
path = "src/zimit/__about__.py"
[tool.hatch.build]
exclude = [
"/.github",
]
[tool.hatch.build.targets.wheel]
packages = ["src/zimit"]
[tool.hatch.envs.default]
features = ["dev"]
[tool.hatch.envs.test]
features = ["scripts", "test"]
[tool.hatch.envs.test.scripts]
run = "inv test --args '{args}'"
run-cov = "inv test-cov --args '{args}'"
report-cov = "inv report-cov"
coverage = "inv coverage --args '{args}'"
html = "inv coverage --html --args '{args}'"
[tool.hatch.envs.lint]
template = "lint"
skip-install = false
features = ["scripts", "lint"]
[tool.hatch.envs.lint.scripts]
black = "inv lint-black --args '{args}'"
ruff = "inv lint-ruff --args '{args}'"
all = "inv lintall --args '{args}'"
fix-black = "inv fix-black --args '{args}'"
fix-ruff = "inv fix-ruff --args '{args}'"
fixall = "inv fixall --args '{args}'"
[tool.hatch.envs.check]
features = ["scripts", "check"]
[tool.hatch.envs.check.scripts]
pyright = "inv check-pyright --args '{args}'"
all = "inv checkall --args '{args}'"
[tool.black]
line-length = 88
target-version = ['py312']
[tool.ruff]
target-version = "py312"
line-length = 88
src = ["src"]
[tool.ruff.lint]
select = [
"A", # flake8-builtins
# "ANN", # flake8-annotations
"ARG", # flake8-unused-arguments
# "ASYNC", # flake8-async
"B", # flake8-bugbear
# "BLE", # flake8-blind-except
"C4", # flake8-comprehensions
"C90", # mccabe
# "COM", # flake8-commas
# "D", # pydocstyle
# "DJ", # flake8-django
"DTZ", # flake8-datetimez
"E", # pycodestyle (default)
"EM", # flake8-errmsg
# "ERA", # eradicate
# "EXE", # flake8-executable
"F", # Pyflakes (default)
# "FA", # flake8-future-annotations
"FBT", # flake8-boolean-trap
# "FLY", # flynt
# "G", # flake8-logging-format
"I", # isort
"ICN", # flake8-import-conventions
# "INP", # flake8-no-pep420
# "INT", # flake8-gettext
"ISC", # flake8-implicit-str-concat
"N", # pep8-naming
# "NPY", # NumPy-specific rules
# "PD", # pandas-vet
# "PGH", # pygrep-hooks
# "PIE", # flake8-pie
# "PL", # Pylint
"PLC", # Pylint: Convention
"PLE", # Pylint: Error
"PLR", # Pylint: Refactor
"PLW", # Pylint: Warning
# "PT", # flake8-pytest-style
# "PTH", # flake8-use-pathlib
# "PYI", # flake8-pyi
"Q", # flake8-quotes
# "RET", # flake8-return
# "RSE", # flake8-raise
"RUF", # Ruff-specific rules
"S", # flake8-bandit
# "SIM", # flake8-simplify
# "SLF", # flake8-self
"T10", # flake8-debugger
"T20", # flake8-print
# "TCH", # flake8-type-checking
# "TD", # flake8-todos
"TID", # flake8-tidy-imports
# "TRY", # tryceratops
"UP", # pyupgrade
"W", # pycodestyle
"YTT", # flake8-2020
]
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
"EM",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Ignore checks for possible passwords
"S105", "S106", "S107",
# Ignore warnings on subprocess.run / popen
"S603",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
]
unfixable = [
# Don't touch unused imports
"F401",
]
[tool.ruff.lint.isort]
known-first-party = ["zimit"]
[tool.ruff.lint.flake8-bugbear]
# add exceptions to B008 for fastapi.
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "all"
[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
[tool.pytest.ini_options]
minversion = "7.3"
testpaths = ["tests"]
pythonpath = [".", "src"]
[tool.coverage.paths]
zimit = ["src/zimit"]
tests = ["tests"]
[tool.coverage.run]
source_pkgs = ["zimit"]
branch = true
parallel = true
omit = [
"src/zimit/__about__.py",
]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.pyright]
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.12"
typeCheckingMode="basic"

1
src/zimit/__about__.py Normal file
View File

@ -0,0 +1 @@
__version__ = "2.0.0-dev4"

View File

@ -1,7 +1,3 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
"""
Main zimit run script
This script validates arguments with warc2zim, checks permissions
@ -11,6 +7,7 @@ and then calls the Node based driver
import atexit
import itertools
import json
import logging
import shutil
import signal
import subprocess
@ -23,19 +20,24 @@ from pathlib import Path
import inotify
import inotify.adapters
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
from zimit.__about__ import __version__
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
logger = getLogger(name="zimit", level=logging.INFO)
class ProgressFileWatcher:
def __init__(self, output_dir, stats_path):
def __init__(self, output_dir: Path, stats_path: Path):
self.crawl_path = output_dir / "crawl.json"
self.warc2zim_path = output_dir / "warc2zim.json"
self.stats_path = Path(stats_path)
self.stats_path = stats_path
if not self.stats_path.is_absolute():
self.stats_path = output_dir / self.stats_path
@ -46,6 +48,8 @@ class ProgressFileWatcher:
self.process = None
def stop(self):
if not self.process:
return
self.process.join(0.1)
self.process.terminate()
@ -58,10 +62,10 @@ class ProgressFileWatcher:
self.process.start()
@staticmethod
def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath):
def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
ino = inotify.adapters.Inotify()
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
class Limit:
def __init__(self):
@ -97,15 +101,15 @@ class ProgressFileWatcher:
"limit": limit.as_dict,
}
for _, _, fpath, _ in ino.event_gen(yield_nones=False):
for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore
func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
if not func:
continue
# open input and output separatly as to not clear output on error
with open(fpath, "r") as ifh:
with open(fpath) as ifh:
try:
out = func(json.load(ifh), limit)
except Exception: # nosec
except Exception: # nosec # noqa: S112
# simply ignore progress update should an error arise
# might be malformed input for instance
continue
@ -115,7 +119,7 @@ class ProgressFileWatcher:
json.dump(out, ofh)
def zimit(args=None):
def run(raw_args):
wait_until_options = ["load", "domcontentloaded", "networkidle"]
wait_until_all = wait_until_options + [
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
@ -131,7 +135,7 @@ def zimit(args=None):
parser.add_argument(
"--urlFile",
help="If set, read a list of seed urls, " "one per line, from the specified",
help="If set, read a list of seed urls, one per line, from the specified",
)
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
@ -205,7 +209,8 @@ def zimit(args=None):
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
)
parser.add_argument(
@ -220,12 +225,21 @@ def zimit(args=None):
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob/"
"main/packages/puppeteer-core/src/common/Device.ts",
default="Pixel 2",
)
# Opt-out switch for mobile emulation: when set, no `--mobileDevice` is
# forwarded to the crawler command line.
parser.add_argument(
    "--noMobileDevice",
    # the trailing space matters: adjacent literals are concatenated verbatim
    help="Do not emulate a mobile device (use at your own risk, behavior is "
    "uncertain)",
    action="store_true",
    default=False,
)
parser.add_argument(
"--userAgent",
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
default=DEFAULT_USER_AGENT,
help="Override default user-agent with specified value ; --userAgentSuffix and "
"--adminEmail have no effect when this is set",
)
parser.add_argument(
@ -333,7 +347,38 @@ def zimit(args=None):
"to configure the crawling behaviour if not set via argument.",
)
zimit_args, warc2zim_args = parser.parse_known_args(args)
parser.add_argument(
"--version",
help="Display scraper version and exit",
action="version",
version=f"Zimit {__version__}",
)
parser.add_argument(
"--logging",
help="Crawler logging configuration",
)
zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
logger.info("Checking browsertrix-crawler version")
crawl_version_cmd = ["crawl", "--version"]
try:
crawl = subprocess.run(
crawl_version_cmd, check=True, capture_output=True, text=True
)
except Exception:
logger.error("Failed to get Browsertrix crawler version")
raise
crawler_version = crawl.stdout.strip()
logger.info(f"Browsertrix crawler: version {crawler_version}")
# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
# versions are associated with the ZIM
warc2zim_args.append("--scraper-suffix")
warc2zim_args.append(
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
)
# pass url and output to warc2zim also
if zimit_args.output:
@ -342,14 +387,12 @@ def zimit(args=None):
url = zimit_args.url
user_agent = zimit_args.userAgent
if zimit_args.userAgentSuffix:
user_agent += f" {zimit_args.userAgentSuffix}"
user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail:
user_agent += f" {zimit_args.adminEmail}"
user_agent_suffix += f" {zimit_args.adminEmail}"
if url:
url = check_url(url, user_agent, zimit_args.scopeType)
url = get_cleaned_url(url)
warc2zim_args.append("--url")
warc2zim_args.append(url)
@ -372,13 +415,13 @@ def zimit(args=None):
warc2zim_args.append("--lang")
warc2zim_args.append(zimit_args.zim_lang)
print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
logger.info("----------")
logger.info("Testing warc2zim args")
logger.info("Running: warc2zim " + " ".join(warc2zim_args))
res = warc2zim(warc2zim_args)
if res != 100:
print("Exiting, invalid warc2zim params")
return 2
if res != NORMAL_WARC2ZIM_EXIT_CODE:
logger.info("Exiting, invalid warc2zim params")
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
# make temp dir for this crawl
if zimit_args.build:
@ -389,9 +432,9 @@ def zimit(args=None):
if not zimit_args.keep:
def cleanup():
print("")
print("----------")
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
logger.info("")
logger.info("----------")
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
shutil.rmtree(temp_root_dir)
atexit.register(cleanup)
@ -401,8 +444,12 @@ def zimit(args=None):
cmd_args.append("--url")
cmd_args.append(url)
cmd_args.append("--userAgent")
cmd_args.append(user_agent)
cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
if not zimit_args.noMobileDevice:
cmd_args.append("--mobileDevice")
cmd_args.append(zimit_args.mobileDevice)
cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
@ -412,7 +459,7 @@ def zimit(args=None):
watcher = ProgressFileWatcher(
Path(zimit_args.output), Path(zimit_args.statsFilename)
)
print(f"Writing progress to {watcher.stats_path}")
logger.info(f"Writing progress to {watcher.stats_path}")
# update crawler command
cmd_args.append("--statsFilename")
cmd_args.append(str(watcher.crawl_path))
@ -424,15 +471,16 @@ def zimit(args=None):
cmd_line = " ".join(cmd_args)
print("")
print("----------")
print(
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
logger.info("")
logger.info("----------")
logger.info(
f"Output to tempdir: {temp_root_dir} - "
f"{'will keep' if zimit_args.keep else 'will delete'}"
)
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
crawl = subprocess.run(cmd_args)
if crawl.returncode == 11:
print("crawl interupted by a limit")
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(cmd_args, check=False)
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
logger.info("crawl interupted by a limit")
elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
@ -447,65 +495,33 @@ def zimit(args=None):
"Failed to find directory where WARC files have been created"
)
elif len(warc_dirs) > 1:
print("Found many WARC files directories, only last one will be used")
logger.info("Found many WARC files directories, only last one will be used")
for directory in warc_dirs:
print(f"- {directory}")
logger.info(f"- {directory}")
warc_directory = warc_dirs[-1]
print("")
print("----------")
print(f"Processing WARC files in {warc_directory}")
logger.info("")
logger.info("----------")
logger.info(f"Processing WARC files in {warc_directory}")
warc2zim_args.append(str(warc_directory))
num_files = sum(1 for _ in warc_directory.iterdir())
print(f"{num_files} WARC files found", flush=True)
print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)
logger.info(f"{num_files} WARC files found")
logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
return warc2zim(warc2zim_args)
def check_url(url, user_agent, scope=None):
url = urllib.parse.urlparse(url)
try:
with requests.get(
url.geturl(),
stream=True,
allow_redirects=True,
timeout=(12.2, 27),
headers={"User-Agent": user_agent},
) as resp:
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
raise SystemExit(1)
actual_url = urllib.parse.urlparse(resp.url)
def get_cleaned_url(url: str):
parsed_url = urllib.parse.urlparse(url)
# remove explicit port in URI for default-for-scheme as browsers does it
if actual_url.scheme == "https" and actual_url.port == 443:
actual_url = rebuild_uri(actual_url, port="")
if actual_url.scheme == "http" and actual_url.port == 80:
actual_url = rebuild_uri(actual_url, port="")
if parsed_url.scheme == "https" and parsed_url.port == 443: # noqa: PLR2004
parsed_url = rebuild_uri(parsed_url, port="")
if parsed_url.scheme == "http" and parsed_url.port == 80: # noqa: PLR2004
parsed_url = rebuild_uri(parsed_url, port="")
if actual_url.geturl() != url.geturl():
if scope in (None, "any"):
return actual_url.geturl()
print(
"[WARN] Your URL ({0}) redirects to {1} which {2} on same "
"first-level domain. Depending on your scopeType ({3}), "
"your homepage might be out-of-scope. Please check!".format(
url.geturl(),
actual_url.geturl(),
"is"
if get_fld(url.geturl()) == get_fld(actual_url.geturl())
else "is not",
scope,
)
)
return actual_url.geturl()
return url.geturl()
return parsed_url.geturl()
def get_node_cmd_line(args):
@ -527,7 +543,7 @@ def get_node_cmd_line(args):
"collection",
"allowHashUrls",
"lang",
"mobileDevice",
"userAgent",
"useSitemap",
"behaviors",
"behaviorTimeout",
@ -539,9 +555,10 @@ def get_node_cmd_line(args):
"healthCheckPort",
"overwrite",
"config",
"logging",
]:
value = getattr(args, arg)
if value == None or (isinstance(value, bool) and value == False):
if value is None or (isinstance(value, bool) and value is False):
continue
node_cmd.append("--" + arg)
if not isinstance(value, bool):
@ -550,17 +567,22 @@ def get_node_cmd_line(args):
return node_cmd
def sigint_handler(*args):
print("")
print("")
print("SIGINT/SIGTERM received, stopping zimit")
print("")
print("", flush=True)
def sigint_handler(*args): # noqa: ARG001
logger.info("")
logger.info("")
logger.info("SIGINT/SIGTERM received, stopping zimit")
logger.info("")
logger.info("")
sys.exit(3)
def zimit():
    """Command-line entry point: run the scraper on the process arguments.

    Whatever `run` returns (for instance EXIT_CODE_WARC2ZIM_CHECK_FAILED, or
    the value returned by the final warc2zim call) is used as the process exit
    status, so that callers of the CLI can detect failures.
    """
    sys.exit(run(sys.argv[1:]))
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)
if __name__ == "__main__":
zimit()

109
tasks.py Normal file
View File

@ -0,0 +1,109 @@
# pyright: strict, reportUntypedFunctionDecorator=false
import os
from invoke.context import Context
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
use_pty = not os.getenv("CI", "")
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """Run the test suite with pytest, without measuring coverage"""
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run tests with coverage"""
    # coverage wraps pytest; extra arguments are forwarded to pytest as-is
    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """Combine coverage data, print the report, optionally export it as HTML"""
    # warn=True: a failing `combine` does not abort the task
    ctx.run("coverage combine", warn=True, pty=use_pty)
    ctx.run("coverage report --show-missing", pty=use_pty)
    if not html:
        return
    ctx.run("coverage html", pty=use_pty)
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """Run the tests under coverage, then report the results"""
    # step 1: collect coverage data while running pytest
    test_cov(ctx, args=args)
    # step 2: combine + display (and export to HTML when requested)
    report_cov(ctx, html=html)
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """check black formatting (no file is modified)"""
    args = args or "."  # needed for hatch script
    # print the tool version first so that it shows up in the output
    ctx.run("black --version", pty=use_pty)
    ctx.run(f"black --check --diff {args}", pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """check ruff rules (no file is modified)"""
    args = args or "."  # needed for hatch script
    # print the tool version first so that it shows up in the output
    ctx.run("ruff --version", pty=use_pty)
    ctx.run(f"ruff check {args}", pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Run every lint check (black, then ruff)"""
    target = args or "."  # needed for hatch script
    for linter in (lint_black, lint_ruff):
        linter(ctx, target)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright"""
    # same pty setting as every other tool invocation in this file
    ctx.run("pyright --version", pty=use_pty)
    ctx.run(f"pyright {args}", pty=use_pty)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """Run every static check (currently pyright only)"""
    check_pyright(ctx, args)
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """Reformat code in place with black"""
    target = args or "."  # needed for hatch script
    command = f"black {target}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """Apply ruff automatic fixes in place"""
    target = args or "."  # needed for hatch script
    command = f"ruff check --fix {target}"
    ctx.run(command, pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Apply all automatic fixes, then run the lint checks again"""
    target = args or "."  # needed for hatch script
    # order matters: fix with black, then ruff, and finally verify the result
    for step in (fix_black, fix_ruff, lintall):
        step(ctx, target)

View File

@ -0,0 +1 @@
These are integration tests, meant to be run in CI (because we first need to perform a zimit run on a given website and then check its output).

View File

@ -1,14 +1,9 @@
import os
import glob
import json
import os
import libzim.reader
from warcio import ArchiveIterator
def get_zim_article(zimfile, path):
zim_fh = libzim.reader.Archive(zimfile)
return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
from zimscraperlib.zim import Archive
def test_is_file():
@ -20,20 +15,34 @@ def test_zim_main_page():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""
assert b'"https://isago.rskg.org/"' in get_zim_article(
"/output/isago.zim", "A/index.html"
)
main_entry = Archive("/output/isago.zim").main_entry
assert main_entry.is_redirect
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
def test_zim_scraper():
    """Check the "Scraper" metadata of the ZIM

    It must mention zimit, warc2zim and Browsertrix crawler.
    """

    zim_fh = Archive("/output/isago.zim")
    scraper = zim_fh.get_text_metadata("Scraper")
    # each tool name is followed by a space in the metadata value
    assert "zimit " in scraper
    assert "warc2zim " in scraper
    assert "Browsertrix crawler " in scraper
def test_user_agent():
"""Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
"""Test that mobile user agent was used
Check is done in WARC request records with custom Zimit and email suffix
"""
found = False
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
with open(warc, "rb") as fh:
for record in ArchiveIterator(fh):
if record.rec_type == "request":
print(record.http_headers)
print(record.http_headers) # noqa: T201
ua = record.http_headers.get_header("User-Agent")
if ua:
assert "Mozilla" in ua
@ -56,12 +65,12 @@ def test_stats_output():
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 8,
"total": 8,
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 8,
"total": 8,
"done": 7,
"total": 7,
"limit": {"max": 0, "hit": False},
}

6
tests/test_dummy.py Normal file
View File

@ -0,0 +1,6 @@
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
# dummy test, just to have coverage report done
def test_something_exists():
    """Placeholder test: the imported exit-code constant is truthy"""
    constant_is_truthy = bool(NORMAL_WARC2ZIM_EXIT_CODE)
    assert constant_is_truthy