mirror of
https://github.com/openzim/zimit.git
synced 2025-09-24 04:30:11 -04:00
Merge branch 'zimit2'
This commit is contained in:
commit
ce49a5d4e9
@ -1,2 +0,0 @@
|
|||||||
output/
|
|
||||||
node_modules/
|
|
@ -1,26 +1,20 @@
|
|||||||
name: Docker
|
name: Publish released version
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
release:
|
||||||
branches:
|
types: [published]
|
||||||
- main
|
|
||||||
tags:
|
|
||||||
- v*
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-and-push:
|
publish:
|
||||||
name: Deploy Docker Image
|
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Retrieve source code
|
- uses: actions/checkout@v3
|
||||||
uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: Build and push
|
- name: Build and push Docker image
|
||||||
uses: openzim/docker-publish-action@v10
|
uses: openzim/docker-publish-action@v10
|
||||||
with:
|
with:
|
||||||
image-name: openzim/zimit
|
image-name: openzim/zimit
|
||||||
on-master: dev
|
|
||||||
tag-pattern: /^v([0-9.]+)$/
|
tag-pattern: /^v([0-9.]+)$/
|
||||||
latest-on-tag: true
|
latest-on-tag: true
|
||||||
restrict-to: openzim/zimit
|
restrict-to: openzim/zimit
|
30
.github/workflows/PublishDockerDevImage.yaml
vendored
Normal file
30
.github/workflows/PublishDockerDevImage.yaml
vendored
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
name: Publish Docker dev image
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
publish:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Build and push Docker image
|
||||||
|
uses: openzim/docker-publish-action@v10
|
||||||
|
with:
|
||||||
|
image-name: openzim/zimit
|
||||||
|
manual-tag: dev
|
||||||
|
latest-on-tag: false
|
||||||
|
restrict-to: openzim/zimit
|
||||||
|
registries: ghcr.io
|
||||||
|
credentials:
|
||||||
|
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||||
|
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||||
|
repo_description: auto
|
||||||
|
repo_overview: auto
|
||||||
|
platforms: |
|
||||||
|
linux/amd64
|
||||||
|
linux/arm64
|
30
.github/workflows/PublishDockerZimit2Image.yaml
vendored
Normal file
30
.github/workflows/PublishDockerZimit2Image.yaml
vendored
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
name: Publish Docker zimit2 image
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- zimit2
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
publish:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Build and push Docker image
|
||||||
|
uses: openzim/docker-publish-action@v10
|
||||||
|
with:
|
||||||
|
image-name: openzim/zimit
|
||||||
|
manual-tag: zimit2
|
||||||
|
latest-on-tag: false
|
||||||
|
restrict-to: openzim/zimit
|
||||||
|
registries: ghcr.io
|
||||||
|
credentials:
|
||||||
|
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||||
|
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||||
|
repo_description: auto
|
||||||
|
repo_overview: auto
|
||||||
|
platforms: |
|
||||||
|
linux/amd64
|
||||||
|
linux/arm64
|
34
.github/workflows/QA.yaml
vendored
Normal file
34
.github/workflows/QA.yaml
vendored
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
name: QA
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
check-qa:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version-file: pyproject.toml
|
||||||
|
architecture: x64
|
||||||
|
|
||||||
|
- name: Install dependencies (and project)
|
||||||
|
run: |
|
||||||
|
pip install -U pip
|
||||||
|
pip install -e .[lint,scripts,test,check]
|
||||||
|
|
||||||
|
- name: Check black formatting
|
||||||
|
run: inv lint-black
|
||||||
|
|
||||||
|
- name: Check ruff
|
||||||
|
run: inv lint-ruff
|
||||||
|
|
||||||
|
- name: Check pyright
|
||||||
|
run: inv check-pyright
|
66
.github/workflows/Tests.yaml
vendored
Normal file
66
.github/workflows/Tests.yaml
vendored
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
name: Tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
run-tests:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version-file: pyproject.toml
|
||||||
|
architecture: x64
|
||||||
|
|
||||||
|
- name: Install dependencies (and project)
|
||||||
|
run: |
|
||||||
|
pip install -U pip
|
||||||
|
pip install -e .[test,scripts]
|
||||||
|
|
||||||
|
- name: Run the tests
|
||||||
|
run: inv coverage --args "-vvv"
|
||||||
|
|
||||||
|
- name: Upload coverage report to codecov
|
||||||
|
uses: codecov/codecov-action@v3
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
|
|
||||||
|
build_python:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version-file: pyproject.toml
|
||||||
|
architecture: x64
|
||||||
|
|
||||||
|
- name: Ensure we can build Python targets
|
||||||
|
run: |
|
||||||
|
pip install -U pip build
|
||||||
|
python3 -m build --sdist --wheel
|
||||||
|
|
||||||
|
# this job replaces the standard "build_docker" job since it builds the docker image
|
||||||
|
run-integration-tests:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: checkout
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: build image
|
||||||
|
run: docker build -t zimit .
|
||||||
|
|
||||||
|
- name: run crawl
|
||||||
|
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||||
|
|
||||||
|
- name: run integration test suite
|
||||||
|
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
|
20
.github/workflows/ci.yaml
vendored
20
.github/workflows/ci.yaml
vendored
@ -1,20 +0,0 @@
|
|||||||
name: CI
|
|
||||||
|
|
||||||
on: push
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
integration-tests:
|
|
||||||
runs-on: ubuntu-22.04
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: checkout
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: build image
|
|
||||||
run: docker build -t zimit .
|
|
||||||
|
|
||||||
- name: run crawl
|
|
||||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
|
||||||
|
|
||||||
- name: run integration test suite
|
|
||||||
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
|
|
27
.pre-commit-config.yaml
Normal file
27
.pre-commit-config.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# See https://pre-commit.com for more information
|
||||||
|
# See https://pre-commit.com/hooks.html for more hooks
|
||||||
|
repos:
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: v4.4.0
|
||||||
|
hooks:
|
||||||
|
- id: trailing-whitespace
|
||||||
|
- id: end-of-file-fixer
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: "24.2.0"
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
|
rev: v0.3.0
|
||||||
|
hooks:
|
||||||
|
- id: ruff
|
||||||
|
- repo: https://github.com/RobertCraigie/pyright-python
|
||||||
|
rev: v1.1.352
|
||||||
|
hooks:
|
||||||
|
- id: pyright
|
||||||
|
name: pyright (system)
|
||||||
|
description: 'pyright static type checker'
|
||||||
|
entry: pyright
|
||||||
|
language: system
|
||||||
|
'types_or': [python, pyi]
|
||||||
|
require_serial: true
|
||||||
|
minimum_pre_commit_version: '2.9.2'
|
22
CHANGELOG.md
22
CHANGELOG.md
@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- New `--version` flag to display Zimit version
|
||||||
|
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
|
||||||
|
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
|
||||||
|
- New `--noMobileDevice` CLI argument
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
|
||||||
|
- Use `warc2zim` version 2, which no longer requires a Service Worker
|
||||||
|
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
|
||||||
|
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
|
||||||
|
- Adopt Python bootstrap conventions
|
||||||
|
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
|
||||||
|
- Upgrade to Python 3.12 + upgrade dependencies
|
||||||
|
- `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values
|
||||||
|
- `--userAgent` CLI argument is not mandatory anymore
|
||||||
|
- Upgraded Browsertrix Crawler to 1.0.3
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fix support for Youtube videos (#291)
|
||||||
|
|
||||||
## [1.6.3] - 2024-01-18
|
## [1.6.3] - 2024-01-18
|
||||||
|
|
||||||
|
50
Dockerfile
50
Dockerfile
@ -1,36 +1,48 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:0.12.4
|
FROM webrecorder/browsertrix-crawler:1.1.1
|
||||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||||
|
|
||||||
|
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
|
||||||
|
RUN add-apt-repository ppa:deadsnakes/ppa -y
|
||||||
|
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -qqy --no-install-recommends \
|
&& apt-get install -qqy --no-install-recommends \
|
||||||
libmagic1 \
|
libmagic1 \
|
||||||
python3.10-venv \
|
python3.12-venv \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
# python setup (in venv not to conflict with browsertrix)
|
# python setup (in venv not to conflict with browsertrix)
|
||||||
&& python3 -m venv /app/zimit \
|
&& python3.12 -m venv /app/zimit \
|
||||||
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \
|
|
||||||
'warc2zim==1.5.5' \
|
|
||||||
# placeholder (default output location)
|
# placeholder (default output location)
|
||||||
&& mkdir -p /output \
|
&& mkdir -p /output \
|
||||||
# disable chrome upgrade
|
# disable chrome upgrade
|
||||||
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
|
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
|
||||||
# download list of bad domains to filter-out. intentionally run post-install \
|
# download list of bad domains to filter-out. intentionally run post-install \
|
||||||
# so it's not cached in earlier layers (url stays same but content updated) \
|
# so it's not cached in earlier layers (url stays same but content updated) \
|
||||||
mkdir -p /tmp/ads && cd /tmp/ads && \
|
&& mkdir -p /tmp/ads \
|
||||||
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
|
&& cd /tmp/ads \
|
||||||
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
|
&& curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
|
||||||
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
|
&& curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
|
||||||
cat ./*.txt > /etc/blocklist.txt \
|
&& curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
|
||||||
|
&& cat ./*.txt > /etc/blocklist.txt \
|
||||||
&& rm ./*.txt \
|
&& rm ./*.txt \
|
||||||
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
|
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
|
||||||
chmod +x /usr/local/bin/entrypoint.sh
|
&& chmod +x /usr/local/bin/entrypoint.sh
|
||||||
|
|
||||||
WORKDIR /app
|
# Copy pyproject.toml and its dependencies
|
||||||
ADD zimit.py /app/
|
COPY pyproject.toml README.md /src/
|
||||||
# fix shebang on zimit to use in-venv python
|
COPY src/zimit/__about__.py /src/src/zimit/__about__.py
|
||||||
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
|
|
||||||
&& ln -s /app/zimit.py /usr/bin/zimit \
|
# Install Python dependencies
|
||||||
&& chmod +x /usr/bin/zimit
|
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
|
||||||
|
|
||||||
|
# Copy code + associated artifacts
|
||||||
|
COPY src /src/src
|
||||||
|
COPY *.md /src/
|
||||||
|
|
||||||
|
# Install + cleanup
|
||||||
|
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
|
||||||
|
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
|
||||||
|
&& chmod +x /usr/bin/zimit \
|
||||||
|
&& rm -rf /src
|
||||||
|
|
||||||
ENTRYPOINT ["entrypoint.sh"]
|
ENTRYPOINT ["entrypoint.sh"]
|
||||||
CMD ["zimit"]
|
CMD ["zimit", "--help"]
|
||||||
|
10
README.md
10
README.md
@ -3,12 +3,9 @@ Zimit
|
|||||||
|
|
||||||
Zimit is a scraper allowing to create ZIM file from any Web site.
|
Zimit is a scraper allowing to create ZIM file from any Web site.
|
||||||
|
|
||||||
[](https://ghcr.io/openzim/zimit)
|
|
||||||
[](https://github.com/openzim/zimit/actions?query=branch%3Amain)
|
|
||||||
[](https://www.codefactor.io/repository/github/openzim/zimit)
|
[](https://www.codefactor.io/repository/github/openzim/zimit)
|
||||||
[](https://www.gnu.org/licenses/gpl-3.0)
|
[](https://www.gnu.org/licenses/gpl-3.0)
|
||||||
|
[](https://ghcr.io/openzim/zimit)
|
||||||
⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS.
|
|
||||||
|
|
||||||
Technical background
|
Technical background
|
||||||
--------------------
|
--------------------
|
||||||
@ -68,7 +65,10 @@ default and prints the crawl status to the Docker log.
|
|||||||
Nota bene
|
Nota bene
|
||||||
---------
|
---------
|
||||||
|
|
||||||
A first version of a generic HTTP scraper was created in 2016 during
|
While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
|
||||||
|
since Zimit 2.x, which has no special requirements.
|
||||||
|
|
||||||
|
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
|
||||||
the [Wikimania Esino Lario
|
the [Wikimania Esino Lario
|
||||||
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
|
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
|
||||||
|
|
||||||
|
224
pyproject.toml
Normal file
224
pyproject.toml
Normal file
@ -0,0 +1,224 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["hatchling", "hatch-openzim==0.2.0"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "zimit"
|
||||||
|
requires-python = ">=3.12,<3.13"
|
||||||
|
description = "Make ZIM file from any website through crawling"
|
||||||
|
readme = "README.md"
|
||||||
|
dependencies = [
|
||||||
|
"requests==2.31.0",
|
||||||
|
"inotify==0.2.10",
|
||||||
|
"tld==0.13",
|
||||||
|
"warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
|
||||||
|
]
|
||||||
|
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
|
||||||
|
|
||||||
|
[tool.hatch.metadata.hooks.openzim-metadata]
|
||||||
|
kind = "scraper"
|
||||||
|
|
||||||
|
[tool.hatch.metadata]
|
||||||
|
allow-direct-references = true # to be removed once we use a released warc2zim version
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
scripts = [
|
||||||
|
"invoke==2.2.0",
|
||||||
|
]
|
||||||
|
lint = [
|
||||||
|
"black==24.2.0",
|
||||||
|
"ruff==0.3.0",
|
||||||
|
]
|
||||||
|
check = [
|
||||||
|
"pyright==1.1.352",
|
||||||
|
]
|
||||||
|
test = [
|
||||||
|
"pytest==8.0.2",
|
||||||
|
"coverage==7.4.3",
|
||||||
|
]
|
||||||
|
dev = [
|
||||||
|
"pre-commit==3.6.2",
|
||||||
|
"debugpy==1.8.1",
|
||||||
|
"zimit[scripts]",
|
||||||
|
"zimit[lint]",
|
||||||
|
"zimit[test]",
|
||||||
|
"zimit[check]",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
zimit = "zimit:zimit.zimit"
|
||||||
|
|
||||||
|
[tool.hatch.version]
|
||||||
|
path = "src/zimit/__about__.py"
|
||||||
|
|
||||||
|
[tool.hatch.build]
|
||||||
|
exclude = [
|
||||||
|
"/.github",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["src/zimit"]
|
||||||
|
|
||||||
|
[tool.hatch.envs.default]
|
||||||
|
features = ["dev"]
|
||||||
|
|
||||||
|
[tool.hatch.envs.test]
|
||||||
|
features = ["scripts", "test"]
|
||||||
|
|
||||||
|
[tool.hatch.envs.test.scripts]
|
||||||
|
run = "inv test --args '{args}'"
|
||||||
|
run-cov = "inv test-cov --args '{args}'"
|
||||||
|
report-cov = "inv report-cov"
|
||||||
|
coverage = "inv coverage --args '{args}'"
|
||||||
|
html = "inv coverage --html --args '{args}'"
|
||||||
|
|
||||||
|
[tool.hatch.envs.lint]
|
||||||
|
template = "lint"
|
||||||
|
skip-install = false
|
||||||
|
features = ["scripts", "lint"]
|
||||||
|
|
||||||
|
[tool.hatch.envs.lint.scripts]
|
||||||
|
black = "inv lint-black --args '{args}'"
|
||||||
|
ruff = "inv lint-ruff --args '{args}'"
|
||||||
|
all = "inv lintall --args '{args}'"
|
||||||
|
fix-black = "inv fix-black --args '{args}'"
|
||||||
|
fix-ruff = "inv fix-ruff --args '{args}'"
|
||||||
|
fixall = "inv fixall --args '{args}'"
|
||||||
|
|
||||||
|
[tool.hatch.envs.check]
|
||||||
|
features = ["scripts", "check"]
|
||||||
|
|
||||||
|
[tool.hatch.envs.check.scripts]
|
||||||
|
pyright = "inv check-pyright --args '{args}'"
|
||||||
|
all = "inv checkall --args '{args}'"
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 88
|
||||||
|
target-version = ['py312']
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
target-version = "py312"
|
||||||
|
line-length = 88
|
||||||
|
src = ["src"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
"A", # flake8-builtins
|
||||||
|
# "ANN", # flake8-annotations
|
||||||
|
"ARG", # flake8-unused-arguments
|
||||||
|
# "ASYNC", # flake8-async
|
||||||
|
"B", # flake8-bugbear
|
||||||
|
# "BLE", # flake8-blind-except
|
||||||
|
"C4", # flake8-comprehensions
|
||||||
|
"C90", # mccabe
|
||||||
|
# "COM", # flake8-commas
|
||||||
|
# "D", # pydocstyle
|
||||||
|
# "DJ", # flake8-django
|
||||||
|
"DTZ", # flake8-datetimez
|
||||||
|
"E", # pycodestyle (default)
|
||||||
|
"EM", # flake8-errmsg
|
||||||
|
# "ERA", # eradicate
|
||||||
|
# "EXE", # flake8-executable
|
||||||
|
"F", # Pyflakes (default)
|
||||||
|
# "FA", # flake8-future-annotations
|
||||||
|
"FBT", # flake8-boolean-trap
|
||||||
|
# "FLY", # flynt
|
||||||
|
# "G", # flake8-logging-format
|
||||||
|
"I", # isort
|
||||||
|
"ICN", # flake8-import-conventions
|
||||||
|
# "INP", # flake8-no-pep420
|
||||||
|
# "INT", # flake8-gettext
|
||||||
|
"ISC", # flake8-implicit-str-concat
|
||||||
|
"N", # pep8-naming
|
||||||
|
# "NPY", # NumPy-specific rules
|
||||||
|
# "PD", # pandas-vet
|
||||||
|
# "PGH", # pygrep-hooks
|
||||||
|
# "PIE", # flake8-pie
|
||||||
|
# "PL", # Pylint
|
||||||
|
"PLC", # Pylint: Convention
|
||||||
|
"PLE", # Pylint: Error
|
||||||
|
"PLR", # Pylint: Refactor
|
||||||
|
"PLW", # Pylint: Warning
|
||||||
|
# "PT", # flake8-pytest-style
|
||||||
|
# "PTH", # flake8-use-pathlib
|
||||||
|
# "PYI", # flake8-pyi
|
||||||
|
"Q", # flake8-quotes
|
||||||
|
# "RET", # flake8-return
|
||||||
|
# "RSE", # flake8-raise
|
||||||
|
"RUF", # Ruff-specific rules
|
||||||
|
"S", # flake8-bandit
|
||||||
|
# "SIM", # flake8-simplify
|
||||||
|
# "SLF", # flake8-self
|
||||||
|
"T10", # flake8-debugger
|
||||||
|
"T20", # flake8-print
|
||||||
|
# "TCH", # flake8-type-checking
|
||||||
|
# "TD", # flake8-todos
|
||||||
|
"TID", # flake8-tidy-imports
|
||||||
|
# "TRY", # tryceratops
|
||||||
|
"UP", # pyupgrade
|
||||||
|
"W", # pycodestyle
|
||||||
|
"YTT", # flake8-2020
|
||||||
|
]
|
||||||
|
ignore = [
|
||||||
|
# Allow non-abstract empty methods in abstract base classes
|
||||||
|
"B027",
|
||||||
|
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
|
||||||
|
"EM",
|
||||||
|
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
||||||
|
"FBT003",
|
||||||
|
# Ignore checks for possible passwords
|
||||||
|
"S105", "S106", "S107",
|
||||||
|
# Ignore warnings on subprocess.run / popen
|
||||||
|
"S603",
|
||||||
|
# Ignore complexity
|
||||||
|
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
|
||||||
|
]
|
||||||
|
unfixable = [
|
||||||
|
# Don't touch unused imports
|
||||||
|
"F401",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.lint.isort]
|
||||||
|
known-first-party = ["zimit"]
|
||||||
|
|
||||||
|
[tool.ruff.lint.flake8-bugbear]
|
||||||
|
# add exceptions to B008 for fastapi.
|
||||||
|
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
|
||||||
|
|
||||||
|
[tool.ruff.lint.flake8-tidy-imports]
|
||||||
|
ban-relative-imports = "all"
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
# Tests can use magic values, assertions, and relative imports
|
||||||
|
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
minversion = "7.3"
|
||||||
|
testpaths = ["tests"]
|
||||||
|
pythonpath = [".", "src"]
|
||||||
|
|
||||||
|
[tool.coverage.paths]
|
||||||
|
zimit = ["src/zimit"]
|
||||||
|
tests = ["tests"]
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
source_pkgs = ["zimit"]
|
||||||
|
branch = true
|
||||||
|
parallel = true
|
||||||
|
omit = [
|
||||||
|
"src/zimit/__about__.py",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.coverage.report]
|
||||||
|
exclude_lines = [
|
||||||
|
"no cov",
|
||||||
|
"if __name__ == .__main__.:",
|
||||||
|
"if TYPE_CHECKING:",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.pyright]
|
||||||
|
include = ["src", "tests", "tasks.py"]
|
||||||
|
exclude = [".env/**", ".venv/**"]
|
||||||
|
extraPaths = ["src"]
|
||||||
|
pythonVersion = "3.12"
|
||||||
|
typeCheckingMode="basic"
|
1
src/zimit/__about__.py
Normal file
1
src/zimit/__about__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
__version__ = "2.0.0-dev4"
|
@ -1,7 +1,3 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
# vim: ai ts=4 sts=4 et sw=4 nu
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Main zimit run script
|
Main zimit run script
|
||||||
This script validates arguments with warc2zim, checks permissions
|
This script validates arguments with warc2zim, checks permissions
|
||||||
@ -11,6 +7,7 @@ and then calls the Node based driver
|
|||||||
import atexit
|
import atexit
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
import signal
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
@ -23,19 +20,24 @@ from pathlib import Path
|
|||||||
|
|
||||||
import inotify
|
import inotify
|
||||||
import inotify.adapters
|
import inotify.adapters
|
||||||
import requests
|
|
||||||
from tld import get_fld
|
|
||||||
from warc2zim.main import main as warc2zim
|
from warc2zim.main import main as warc2zim
|
||||||
|
from zimscraperlib.logging import getLogger
|
||||||
from zimscraperlib.uri import rebuild_uri
|
from zimscraperlib.uri import rebuild_uri
|
||||||
|
|
||||||
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
|
from zimit.__about__ import __version__
|
||||||
|
|
||||||
|
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||||
|
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||||
|
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||||
|
|
||||||
|
logger = getLogger(name="zimit", level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
class ProgressFileWatcher:
|
class ProgressFileWatcher:
|
||||||
def __init__(self, output_dir, stats_path):
|
def __init__(self, output_dir: Path, stats_path: Path):
|
||||||
self.crawl_path = output_dir / "crawl.json"
|
self.crawl_path = output_dir / "crawl.json"
|
||||||
self.warc2zim_path = output_dir / "warc2zim.json"
|
self.warc2zim_path = output_dir / "warc2zim.json"
|
||||||
self.stats_path = Path(stats_path)
|
self.stats_path = stats_path
|
||||||
|
|
||||||
if not self.stats_path.is_absolute():
|
if not self.stats_path.is_absolute():
|
||||||
self.stats_path = output_dir / self.stats_path
|
self.stats_path = output_dir / self.stats_path
|
||||||
@ -46,6 +48,8 @@ class ProgressFileWatcher:
|
|||||||
self.process = None
|
self.process = None
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
|
if not self.process:
|
||||||
|
return
|
||||||
self.process.join(0.1)
|
self.process.join(0.1)
|
||||||
self.process.terminate()
|
self.process.terminate()
|
||||||
|
|
||||||
@ -58,10 +62,10 @@ class ProgressFileWatcher:
|
|||||||
self.process.start()
|
self.process.start()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath):
|
def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
|
||||||
ino = inotify.adapters.Inotify()
|
ino = inotify.adapters.Inotify()
|
||||||
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
|
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
|
||||||
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
|
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
|
||||||
|
|
||||||
class Limit:
|
class Limit:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -97,15 +101,15 @@ class ProgressFileWatcher:
|
|||||||
"limit": limit.as_dict,
|
"limit": limit.as_dict,
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, _, fpath, _ in ino.event_gen(yield_nones=False):
|
for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore
|
||||||
func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
|
func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
|
||||||
if not func:
|
if not func:
|
||||||
continue
|
continue
|
||||||
# open input and output separately so as not to clear output on error
|
# open input and output separately so as not to clear output on error
|
||||||
with open(fpath, "r") as ifh:
|
with open(fpath) as ifh:
|
||||||
try:
|
try:
|
||||||
out = func(json.load(ifh), limit)
|
out = func(json.load(ifh), limit)
|
||||||
except Exception: # nosec
|
except Exception: # nosec # noqa: S112
|
||||||
# simply ignore progress update should an error arise
|
# simply ignore progress update should an error arise
|
||||||
# might be malformed input for instance
|
# might be malformed input for instance
|
||||||
continue
|
continue
|
||||||
@ -115,7 +119,7 @@ class ProgressFileWatcher:
|
|||||||
json.dump(out, ofh)
|
json.dump(out, ofh)
|
||||||
|
|
||||||
|
|
||||||
def zimit(args=None):
|
def run(raw_args):
|
||||||
wait_until_options = ["load", "domcontentloaded", "networkidle"]
|
wait_until_options = ["load", "domcontentloaded", "networkidle"]
|
||||||
wait_until_all = wait_until_options + [
|
wait_until_all = wait_until_options + [
|
||||||
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
|
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
|
||||||
@ -131,7 +135,7 @@ def zimit(args=None):
|
|||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--urlFile",
|
"--urlFile",
|
||||||
help="If set, read a list of seed urls, " "one per line, from the specified",
|
help="If set, read a list of seed urls, one per line, from the specified",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
|
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
|
||||||
@ -205,7 +209,8 @@ def zimit(args=None):
|
|||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lang",
|
"--lang",
|
||||||
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
help="if set, sets the language used by the browser, should be ISO 639 "
|
||||||
|
"language[-country] code",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -220,12 +225,21 @@ def zimit(args=None):
|
|||||||
help="Emulate mobile device by name from "
|
help="Emulate mobile device by name from "
|
||||||
"https://github.com/puppeteer/puppeteer/blob/"
|
"https://github.com/puppeteer/puppeteer/blob/"
|
||||||
"main/packages/puppeteer-core/src/common/Device.ts",
|
"main/packages/puppeteer-core/src/common/Device.ts",
|
||||||
|
default="Pixel 2",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--noMobileDevice",
|
||||||
|
help="Do not emulate a mobile device (use at your own risk, behavior is"
|
||||||
|
"uncertain)",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--userAgent",
|
"--userAgent",
|
||||||
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
|
help="Override default user-agent with specified value ; --userAgentSuffix and "
|
||||||
default=DEFAULT_USER_AGENT,
|
"--adminEmail have no effect when this is set",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -333,7 +347,38 @@ def zimit(args=None):
|
|||||||
"to configure the crawling behaviour if not set via argument.",
|
"to configure the crawling behaviour if not set via argument.",
|
||||||
)
|
)
|
||||||
|
|
||||||
zimit_args, warc2zim_args = parser.parse_known_args(args)
|
parser.add_argument(
|
||||||
|
"--version",
|
||||||
|
help="Display scraper version and exit",
|
||||||
|
action="version",
|
||||||
|
version=f"Zimit {__version__}",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--logging",
|
||||||
|
help="Crawler logging configuration",
|
||||||
|
)
|
||||||
|
|
||||||
|
zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
|
||||||
|
|
||||||
|
logger.info("Checking browsertrix-crawler version")
|
||||||
|
crawl_version_cmd = ["crawl", "--version"]
|
||||||
|
try:
|
||||||
|
crawl = subprocess.run(
|
||||||
|
crawl_version_cmd, check=True, capture_output=True, text=True
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.error("Failed to get Browsertrix crawler version")
|
||||||
|
raise
|
||||||
|
crawler_version = crawl.stdout.strip()
|
||||||
|
logger.info(f"Browsertrix crawler: version {crawler_version}")
|
||||||
|
|
||||||
|
# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
|
||||||
|
# versions are associated with the ZIM
|
||||||
|
warc2zim_args.append("--scraper-suffix")
|
||||||
|
warc2zim_args.append(
|
||||||
|
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
|
||||||
|
)
|
||||||
|
|
||||||
# pass url and output to warc2zim also
|
# pass url and output to warc2zim also
|
||||||
if zimit_args.output:
|
if zimit_args.output:
|
||||||
@ -342,14 +387,12 @@ def zimit(args=None):
|
|||||||
|
|
||||||
url = zimit_args.url
|
url = zimit_args.url
|
||||||
|
|
||||||
user_agent = zimit_args.userAgent
|
user_agent_suffix = zimit_args.userAgentSuffix
|
||||||
if zimit_args.userAgentSuffix:
|
|
||||||
user_agent += f" {zimit_args.userAgentSuffix}"
|
|
||||||
if zimit_args.adminEmail:
|
if zimit_args.adminEmail:
|
||||||
user_agent += f" {zimit_args.adminEmail}"
|
user_agent_suffix += f" {zimit_args.adminEmail}"
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
url = check_url(url, user_agent, zimit_args.scopeType)
|
url = get_cleaned_url(url)
|
||||||
warc2zim_args.append("--url")
|
warc2zim_args.append("--url")
|
||||||
warc2zim_args.append(url)
|
warc2zim_args.append(url)
|
||||||
|
|
||||||
@ -372,13 +415,13 @@ def zimit(args=None):
|
|||||||
warc2zim_args.append("--lang")
|
warc2zim_args.append("--lang")
|
||||||
warc2zim_args.append(zimit_args.zim_lang)
|
warc2zim_args.append(zimit_args.zim_lang)
|
||||||
|
|
||||||
print("----------")
|
logger.info("----------")
|
||||||
print("Testing warc2zim args")
|
logger.info("Testing warc2zim args")
|
||||||
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
logger.info("Running: warc2zim " + " ".join(warc2zim_args))
|
||||||
res = warc2zim(warc2zim_args)
|
res = warc2zim(warc2zim_args)
|
||||||
if res != 100:
|
if res != NORMAL_WARC2ZIM_EXIT_CODE:
|
||||||
print("Exiting, invalid warc2zim params")
|
logger.info("Exiting, invalid warc2zim params")
|
||||||
return 2
|
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
|
||||||
|
|
||||||
# make temp dir for this crawl
|
# make temp dir for this crawl
|
||||||
if zimit_args.build:
|
if zimit_args.build:
|
||||||
@ -389,9 +432,9 @@ def zimit(args=None):
|
|||||||
if not zimit_args.keep:
|
if not zimit_args.keep:
|
||||||
|
|
||||||
def cleanup():
|
def cleanup():
|
||||||
print("")
|
logger.info("")
|
||||||
print("----------")
|
logger.info("----------")
|
||||||
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
|
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
|
||||||
shutil.rmtree(temp_root_dir)
|
shutil.rmtree(temp_root_dir)
|
||||||
|
|
||||||
atexit.register(cleanup)
|
atexit.register(cleanup)
|
||||||
@ -401,8 +444,12 @@ def zimit(args=None):
|
|||||||
cmd_args.append("--url")
|
cmd_args.append("--url")
|
||||||
cmd_args.append(url)
|
cmd_args.append(url)
|
||||||
|
|
||||||
cmd_args.append("--userAgent")
|
cmd_args.append("--userAgentSuffix")
|
||||||
cmd_args.append(user_agent)
|
cmd_args.append(user_agent_suffix)
|
||||||
|
|
||||||
|
if not zimit_args.noMobileDevice:
|
||||||
|
cmd_args.append("--mobileDevice")
|
||||||
|
cmd_args.append(zimit_args.mobileDevice)
|
||||||
|
|
||||||
cmd_args.append("--cwd")
|
cmd_args.append("--cwd")
|
||||||
cmd_args.append(str(temp_root_dir))
|
cmd_args.append(str(temp_root_dir))
|
||||||
@ -412,7 +459,7 @@ def zimit(args=None):
|
|||||||
watcher = ProgressFileWatcher(
|
watcher = ProgressFileWatcher(
|
||||||
Path(zimit_args.output), Path(zimit_args.statsFilename)
|
Path(zimit_args.output), Path(zimit_args.statsFilename)
|
||||||
)
|
)
|
||||||
print(f"Writing progress to {watcher.stats_path}")
|
logger.info(f"Writing progress to {watcher.stats_path}")
|
||||||
# update crawler command
|
# update crawler command
|
||||||
cmd_args.append("--statsFilename")
|
cmd_args.append("--statsFilename")
|
||||||
cmd_args.append(str(watcher.crawl_path))
|
cmd_args.append(str(watcher.crawl_path))
|
||||||
@ -424,15 +471,16 @@ def zimit(args=None):
|
|||||||
|
|
||||||
cmd_line = " ".join(cmd_args)
|
cmd_line = " ".join(cmd_args)
|
||||||
|
|
||||||
print("")
|
logger.info("")
|
||||||
print("----------")
|
logger.info("----------")
|
||||||
print(
|
logger.info(
|
||||||
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
|
f"Output to tempdir: {temp_root_dir} - "
|
||||||
|
f"{'will keep' if zimit_args.keep else 'will delete'}"
|
||||||
)
|
)
|
||||||
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
|
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
|
||||||
crawl = subprocess.run(cmd_args)
|
crawl = subprocess.run(cmd_args, check=False)
|
||||||
if crawl.returncode == 11:
|
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
|
||||||
print("crawl interupted by a limit")
|
logger.info("crawl interupted by a limit")
|
||||||
elif crawl.returncode != 0:
|
elif crawl.returncode != 0:
|
||||||
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
|
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
|
||||||
|
|
||||||
@ -447,65 +495,33 @@ def zimit(args=None):
|
|||||||
"Failed to find directory where WARC files have been created"
|
"Failed to find directory where WARC files have been created"
|
||||||
)
|
)
|
||||||
elif len(warc_dirs) > 1:
|
elif len(warc_dirs) > 1:
|
||||||
print("Found many WARC files directories, only last one will be used")
|
logger.info("Found many WARC files directories, only last one will be used")
|
||||||
for directory in warc_dirs:
|
for directory in warc_dirs:
|
||||||
print(f"- {directory}")
|
logger.info(f"- {directory}")
|
||||||
warc_directory = warc_dirs[-1]
|
warc_directory = warc_dirs[-1]
|
||||||
|
|
||||||
print("")
|
logger.info("")
|
||||||
print("----------")
|
logger.info("----------")
|
||||||
print(f"Processing WARC files in {warc_directory}")
|
logger.info(f"Processing WARC files in {warc_directory}")
|
||||||
warc2zim_args.append(str(warc_directory))
|
warc2zim_args.append(str(warc_directory))
|
||||||
|
|
||||||
num_files = sum(1 for _ in warc_directory.iterdir())
|
num_files = sum(1 for _ in warc_directory.iterdir())
|
||||||
print(f"{num_files} WARC files found", flush=True)
|
logger.info(f"{num_files} WARC files found")
|
||||||
print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)
|
logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
|
||||||
|
|
||||||
return warc2zim(warc2zim_args)
|
return warc2zim(warc2zim_args)
|
||||||
|
|
||||||
|
|
||||||
def check_url(url, user_agent, scope=None):
|
def get_cleaned_url(url: str):
|
||||||
url = urllib.parse.urlparse(url)
|
parsed_url = urllib.parse.urlparse(url)
|
||||||
try:
|
|
||||||
with requests.get(
|
|
||||||
url.geturl(),
|
|
||||||
stream=True,
|
|
||||||
allow_redirects=True,
|
|
||||||
timeout=(12.2, 27),
|
|
||||||
headers={"User-Agent": user_agent},
|
|
||||||
) as resp:
|
|
||||||
resp.raise_for_status()
|
|
||||||
except requests.exceptions.RequestException as exc:
|
|
||||||
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
|
|
||||||
raise SystemExit(1)
|
|
||||||
actual_url = urllib.parse.urlparse(resp.url)
|
|
||||||
|
|
||||||
# remove explicit port in URI for default-for-scheme as browsers does it
|
# remove explicit port in URI for default-for-scheme as browsers does it
|
||||||
if actual_url.scheme == "https" and actual_url.port == 443:
|
if parsed_url.scheme == "https" and parsed_url.port == 443: # noqa: PLR2004
|
||||||
actual_url = rebuild_uri(actual_url, port="")
|
parsed_url = rebuild_uri(parsed_url, port="")
|
||||||
if actual_url.scheme == "http" and actual_url.port == 80:
|
if parsed_url.scheme == "http" and parsed_url.port == 80: # noqa: PLR2004
|
||||||
actual_url = rebuild_uri(actual_url, port="")
|
parsed_url = rebuild_uri(parsed_url, port="")
|
||||||
|
|
||||||
if actual_url.geturl() != url.geturl():
|
return parsed_url.geturl()
|
||||||
if scope in (None, "any"):
|
|
||||||
return actual_url.geturl()
|
|
||||||
|
|
||||||
print(
|
|
||||||
"[WARN] Your URL ({0}) redirects to {1} which {2} on same "
|
|
||||||
"first-level domain. Depending on your scopeType ({3}), "
|
|
||||||
"your homepage might be out-of-scope. Please check!".format(
|
|
||||||
url.geturl(),
|
|
||||||
actual_url.geturl(),
|
|
||||||
"is"
|
|
||||||
if get_fld(url.geturl()) == get_fld(actual_url.geturl())
|
|
||||||
else "is not",
|
|
||||||
scope,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return actual_url.geturl()
|
|
||||||
|
|
||||||
return url.geturl()
|
|
||||||
|
|
||||||
|
|
||||||
def get_node_cmd_line(args):
|
def get_node_cmd_line(args):
|
||||||
@ -527,7 +543,7 @@ def get_node_cmd_line(args):
|
|||||||
"collection",
|
"collection",
|
||||||
"allowHashUrls",
|
"allowHashUrls",
|
||||||
"lang",
|
"lang",
|
||||||
"mobileDevice",
|
"userAgent",
|
||||||
"useSitemap",
|
"useSitemap",
|
||||||
"behaviors",
|
"behaviors",
|
||||||
"behaviorTimeout",
|
"behaviorTimeout",
|
||||||
@ -539,9 +555,10 @@ def get_node_cmd_line(args):
|
|||||||
"healthCheckPort",
|
"healthCheckPort",
|
||||||
"overwrite",
|
"overwrite",
|
||||||
"config",
|
"config",
|
||||||
|
"logging",
|
||||||
]:
|
]:
|
||||||
value = getattr(args, arg)
|
value = getattr(args, arg)
|
||||||
if value == None or (isinstance(value, bool) and value == False):
|
if value is None or (isinstance(value, bool) and value is False):
|
||||||
continue
|
continue
|
||||||
node_cmd.append("--" + arg)
|
node_cmd.append("--" + arg)
|
||||||
if not isinstance(value, bool):
|
if not isinstance(value, bool):
|
||||||
@ -550,17 +567,22 @@ def get_node_cmd_line(args):
|
|||||||
return node_cmd
|
return node_cmd
|
||||||
|
|
||||||
|
|
||||||
def sigint_handler(*args):
|
def sigint_handler(*args): # noqa: ARG001
|
||||||
print("")
|
logger.info("")
|
||||||
print("")
|
logger.info("")
|
||||||
print("SIGINT/SIGTERM received, stopping zimit")
|
logger.info("SIGINT/SIGTERM received, stopping zimit")
|
||||||
print("")
|
logger.info("")
|
||||||
print("", flush=True)
|
logger.info("")
|
||||||
sys.exit(3)
|
sys.exit(3)
|
||||||
|
|
||||||
|
|
||||||
|
def zimit():
    """Entry point: run the scraper on the process command-line arguments.

    The value produced by `run` is handed back to the caller instead of being
    dropped, so a wrapper doing `sys.exit(zimit())` can propagate a failure
    code rather than always exiting with status 0.
    """
    # NOTE(review): `run` is defined outside this view; it is assumed to return
    # the process exit code (e.g. EXIT_CODE_WARC2ZIM_CHECK_FAILED) - confirm.
    return run(sys.argv[1:])
|
||||||
|
|
||||||
|
|
||||||
signal.signal(signal.SIGINT, sigint_handler)
|
signal.signal(signal.SIGINT, sigint_handler)
|
||||||
signal.signal(signal.SIGTERM, sigint_handler)
|
signal.signal(signal.SIGTERM, sigint_handler)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
zimit()
|
zimit()
|
109
tasks.py
Normal file
109
tasks.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
# pyright: strict, reportUntypedFunctionDecorator=false
|
||||||
|
import os
|
||||||
|
|
||||||
|
from invoke.context import Context
|
||||||
|
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
|
||||||
|
|
||||||
|
use_pty = not os.getenv("CI", "")
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """Run the test suite with pytest (no coverage collection).

    `args` is appended verbatim to the pytest command line.
    """
    pytest_command = f"pytest {args}"
    ctx.run(pytest_command, pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run tests with coverage

    pytest is started through `coverage run -m`; `args` is appended verbatim
    to the pytest command line.
    """
    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """Combine coverage data and print the report; export HTML when `html` is set."""
    # warn=True: a non-zero exit from `coverage combine` must not abort the task
    ctx.run("coverage combine", warn=True, pty=use_pty)
    ctx.run("coverage report --show-missing", pty=use_pty)
    if not html:
        return
    ctx.run("coverage html", pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """Run the tests under coverage, then report the collected coverage.

    `args` is forwarded to the test run, `html` to the reporting step.
    """
    # step 1: collect coverage data while running the tests
    test_cov(ctx, args=args)
    # step 2: display (and optionally export) the report
    report_cov(ctx, html=html)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """check black formatting

    Prints the black version, then runs black in `--check --diff` mode on
    `args` (current directory when empty), so no file is modified.
    """
    args = args or "."  # needed for hatch script
    ctx.run("black --version", pty=use_pty)
    ctx.run(f"black --check --diff {args}", pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """check all ruff rules

    Prints the ruff version, then runs `ruff check` on `args` (current
    directory when empty) without applying fixes.
    """
    args = args or "."  # needed for hatch script
    ctx.run("ruff --version", pty=use_pty)
    ctx.run(f"ruff check {args}", pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Run every linter in check mode: black first, then ruff."""
    target = args if args else "."  # needed for hatch script
    lint_black(ctx, target)
    lint_ruff(ctx, target)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright

    Prints the pyright version, then runs pyright with `args` appended
    verbatim to its command line.
    """
    # pty=use_pty added for consistency with every other command in this file
    ctx.run("pyright --version", pty=use_pty)
    ctx.run(f"pyright {args}", pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """Run all static type checks (currently pyright only)."""
    check_pyright(ctx, args=args)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """Reformat the code in place with black."""
    target = args if args else "."  # needed for hatch script
    black_command = f"black {target}"
    ctx.run(black_command, pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """Apply every automatic fix ruff offers."""
    target = args if args else "."  # needed for hatch script
    ruff_command = f"ruff check --fix {target}"
    ctx.run(ruff_command, pty=use_pty)
|
||||||
|
|
||||||
|
|
||||||
|
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Apply all automatic fixes, then re-run the linters to verify the result."""
    target = args if args else "."  # needed for hatch script
    # order matters: format, auto-fix, and only then check what is left
    for step in (fix_black, fix_ruff, lintall):
        step(ctx, target)
|
1
tests-integration/README.md
Normal file
1
tests-integration/README.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
These are integration tests, meant to be run inside the CI (because we need to first perform a zimit run on a given website and then check its output).
|
@ -1,14 +1,9 @@
|
|||||||
import os
|
|
||||||
import glob
|
import glob
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
import libzim.reader
|
|
||||||
from warcio import ArchiveIterator
|
from warcio import ArchiveIterator
|
||||||
|
from zimscraperlib.zim import Archive
|
||||||
|
|
||||||
def get_zim_article(zimfile, path):
|
|
||||||
zim_fh = libzim.reader.Archive(zimfile)
|
|
||||||
return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_file():
|
def test_is_file():
|
||||||
@ -20,20 +15,34 @@ def test_zim_main_page():
|
|||||||
"""Main page specified, http://isago.rskg.org/, was a redirect to https
|
"""Main page specified, http://isago.rskg.org/, was a redirect to https
|
||||||
Ensure main page is the redirected page"""
|
Ensure main page is the redirected page"""
|
||||||
|
|
||||||
assert b'"https://isago.rskg.org/"' in get_zim_article(
|
main_entry = Archive("/output/isago.zim").main_entry
|
||||||
"/output/isago.zim", "A/index.html"
|
assert main_entry.is_redirect
|
||||||
)
|
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
|
||||||
|
|
||||||
|
|
||||||
|
def test_zim_scraper():
    """Scraper metadata of the ZIM must mention all three tools

    zimit, warc2zim and Browsertrix crawler are each expected to appear
    (followed by a space) in the "Scraper" text metadata.
    """

    zim_fh = Archive("/output/isago.zim")
    scraper = zim_fh.get_text_metadata("Scraper")
    assert "zimit " in scraper
    assert "warc2zim " in scraper
    assert "Browsertrix crawler " in scraper
|
||||||
|
|
||||||
|
|
||||||
def test_user_agent():
|
def test_user_agent():
|
||||||
"""Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
|
"""Test that mobile user agent was used
|
||||||
|
|
||||||
|
Check is done in WARC request records with custom Zimit and email suffix
|
||||||
|
"""
|
||||||
|
|
||||||
found = False
|
found = False
|
||||||
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
|
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
|
||||||
with open(warc, "rb") as fh:
|
with open(warc, "rb") as fh:
|
||||||
for record in ArchiveIterator(fh):
|
for record in ArchiveIterator(fh):
|
||||||
if record.rec_type == "request":
|
if record.rec_type == "request":
|
||||||
print(record.http_headers)
|
print(record.http_headers) # noqa: T201
|
||||||
ua = record.http_headers.get_header("User-Agent")
|
ua = record.http_headers.get_header("User-Agent")
|
||||||
if ua:
|
if ua:
|
||||||
assert "Mozilla" in ua
|
assert "Mozilla" in ua
|
||||||
@ -56,12 +65,12 @@ def test_stats_output():
|
|||||||
}
|
}
|
||||||
with open("/output/warc2zim.json") as fh:
|
with open("/output/warc2zim.json") as fh:
|
||||||
assert json.loads(fh.read()) == {
|
assert json.loads(fh.read()) == {
|
||||||
"written": 8,
|
"written": 7,
|
||||||
"total": 8,
|
"total": 7,
|
||||||
}
|
}
|
||||||
with open("/output/stats.json") as fh:
|
with open("/output/stats.json") as fh:
|
||||||
assert json.loads(fh.read()) == {
|
assert json.loads(fh.read()) == {
|
||||||
"done": 8,
|
"done": 7,
|
||||||
"total": 8,
|
"total": 7,
|
||||||
"limit": {"max": 0, "hit": False},
|
"limit": {"max": 0, "hit": False},
|
||||||
}
|
}
|
6
tests/test_dummy.py
Normal file
6
tests/test_dummy.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
|
||||||
|
|
||||||
|
|
||||||
|
def test_something_exists():
    """Dummy test, present only so that a coverage report can be produced."""
    assert NORMAL_WARC2ZIM_EXIT_CODE
|
Loading…
x
Reference in New Issue
Block a user