mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
Merge branch 'zimit2'
This commit is contained in:
commit
ce49a5d4e9
@ -1,2 +0,0 @@
|
||||
output/
|
||||
node_modules/
|
@ -1,26 +1,20 @@
|
||||
name: Docker
|
||||
name: Publish released version
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- v*
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
name: Deploy Docker Image
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Retrieve source code
|
||||
uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Build and push
|
||||
- name: Build and push Docker image
|
||||
uses: openzim/docker-publish-action@v10
|
||||
with:
|
||||
image-name: openzim/zimit
|
||||
on-master: dev
|
||||
tag-pattern: /^v([0-9.]+)$/
|
||||
latest-on-tag: true
|
||||
restrict-to: openzim/zimit
|
30
.github/workflows/PublishDockerDevImage.yaml
vendored
Normal file
30
.github/workflows/PublishDockerDevImage.yaml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: Publish Docker dev image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: openzim/docker-publish-action@v10
|
||||
with:
|
||||
image-name: openzim/zimit
|
||||
manual-tag: dev
|
||||
latest-on-tag: false
|
||||
restrict-to: openzim/zimit
|
||||
registries: ghcr.io
|
||||
credentials:
|
||||
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
repo_description: auto
|
||||
repo_overview: auto
|
||||
platforms: |
|
||||
linux/amd64
|
||||
linux/arm64
|
30
.github/workflows/PublishDockerZimit2Image.yaml
vendored
Normal file
30
.github/workflows/PublishDockerZimit2Image.yaml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: Publish Docker zimit2 image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- zimit2
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: openzim/docker-publish-action@v10
|
||||
with:
|
||||
image-name: openzim/zimit
|
||||
manual-tag: zimit2
|
||||
latest-on-tag: false
|
||||
restrict-to: openzim/zimit
|
||||
registries: ghcr.io
|
||||
credentials:
|
||||
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
repo_description: auto
|
||||
repo_overview: auto
|
||||
platforms: |
|
||||
linux/amd64
|
||||
linux/arm64
|
34
.github/workflows/QA.yaml
vendored
Normal file
34
.github/workflows/QA.yaml
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
name: QA
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
check-qa:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip
|
||||
pip install -e .[lint,scripts,test,check]
|
||||
|
||||
- name: Check black formatting
|
||||
run: inv lint-black
|
||||
|
||||
- name: Check ruff
|
||||
run: inv lint-ruff
|
||||
|
||||
- name: Check pyright
|
||||
run: inv check-pyright
|
66
.github/workflows/Tests.yaml
vendored
Normal file
66
.github/workflows/Tests.yaml
vendored
Normal file
@ -0,0 +1,66 @@
|
||||
name: Tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
run-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies (and project)
|
||||
run: |
|
||||
pip install -U pip
|
||||
pip install -e .[test,scripts]
|
||||
|
||||
- name: Run the tests
|
||||
run: inv coverage --args "-vvv"
|
||||
|
||||
- name: Upload coverage report to codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
build_python:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
architecture: x64
|
||||
|
||||
- name: Ensure we can build Python targets
|
||||
run: |
|
||||
pip install -U pip build
|
||||
python3 -m build --sdist --wheel
|
||||
|
||||
# this job replaces the standard "build_docker" job since it builds the docker image
|
||||
run-integration-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: build image
|
||||
run: docker build -t zimit .
|
||||
|
||||
- name: run crawl
|
||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
|
20
.github/workflows/ci.yaml
vendored
20
.github/workflows/ci.yaml
vendored
@ -1,20 +0,0 @@
|
||||
name: CI
|
||||
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
integration-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: build image
|
||||
run: docker build -t zimit .
|
||||
|
||||
- name: run crawl
|
||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
|
27
.pre-commit-config.yaml
Normal file
27
.pre-commit-config.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# See https://pre-commit.com for more information
|
||||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- repo: https://github.com/psf/black
|
||||
rev: "24.2.0"
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.3.0
|
||||
hooks:
|
||||
- id: ruff
|
||||
- repo: https://github.com/RobertCraigie/pyright-python
|
||||
rev: v1.1.352
|
||||
hooks:
|
||||
- id: pyright
|
||||
name: pyright (system)
|
||||
description: 'pyright static type checker'
|
||||
entry: pyright
|
||||
language: system
|
||||
'types_or': [python, pyi]
|
||||
require_serial: true
|
||||
minimum_pre_commit_version: '2.9.2'
|
22
CHANGELOG.md
22
CHANGELOG.md
@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- New `--version` flag to display Zimit version
|
||||
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
|
||||
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
|
||||
- New `--noMobileDevice` CLI argument
|
||||
|
||||
### Changed
|
||||
|
||||
- Use `warc2zim` version 2, which no longer requires a Service Worker
|
||||
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
|
||||
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
|
||||
- Adopt Python bootstrap conventions
|
||||
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
|
||||
- Upgrade to Python 3.12 + upgrade dependencies
|
||||
- `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
|
||||
- `--userAgent` CLI argument is not mandatory anymore
|
||||
- Upgraded Browsertrix Crawler to 1.0.3
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix support for Youtube videos (#291)
|
||||
|
||||
## [1.6.3] - 2024-01-18
|
||||
|
||||
|
50
Dockerfile
50
Dockerfile
@ -1,36 +1,48 @@
|
||||
FROM webrecorder/browsertrix-crawler:0.12.4
|
||||
FROM webrecorder/browsertrix-crawler:1.1.1
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||
|
||||
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
|
||||
RUN add-apt-repository ppa:deadsnakes/ppa -y
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -qqy --no-install-recommends \
|
||||
libmagic1 \
|
||||
python3.10-venv \
|
||||
python3.12-venv \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# python setup (in venv not to conflict with browsertrix)
|
||||
&& python3 -m venv /app/zimit \
|
||||
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \
|
||||
'warc2zim==1.5.5' \
|
||||
&& python3.12 -m venv /app/zimit \
|
||||
# placeholder (default output location)
|
||||
&& mkdir -p /output \
|
||||
# disable chrome upgrade
|
||||
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
|
||||
# download list of bad domains to filter out. intentionally run post-install \
|
||||
# so it's not cached in earlier layers (url stays same but content updated) \
|
||||
mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
|
||||
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
|
||||
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
|
||||
cat ./*.txt > /etc/blocklist.txt \
|
||||
&& mkdir -p /tmp/ads \
|
||||
&& cd /tmp/ads \
|
||||
&& curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
|
||||
&& curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
|
||||
&& curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
|
||||
&& cat ./*.txt > /etc/blocklist.txt \
|
||||
&& rm ./*.txt \
|
||||
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
|
||||
chmod +x /usr/local/bin/entrypoint.sh
|
||||
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
|
||||
&& chmod +x /usr/local/bin/entrypoint.sh
|
||||
|
||||
WORKDIR /app
|
||||
ADD zimit.py /app/
|
||||
# fix shebang on zimit to use in-venv python
|
||||
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
|
||||
&& ln -s /app/zimit.py /usr/bin/zimit \
|
||||
&& chmod +x /usr/bin/zimit
|
||||
# Copy pyproject.toml and its dependencies
|
||||
COPY pyproject.toml README.md /src/
|
||||
COPY src/zimit/__about__.py /src/src/zimit/__about__.py
|
||||
|
||||
# Install Python dependencies
|
||||
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
|
||||
|
||||
# Copy code + associated artifacts
|
||||
COPY src /src/src
|
||||
COPY *.md /src/
|
||||
|
||||
# Install + cleanup
|
||||
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
|
||||
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
|
||||
&& chmod +x /usr/bin/zimit \
|
||||
&& rm -rf /src
|
||||
|
||||
ENTRYPOINT ["entrypoint.sh"]
|
||||
CMD ["zimit"]
|
||||
CMD ["zimit", "--help"]
|
||||
|
10
README.md
10
README.md
@ -3,12 +3,9 @@ Zimit
|
||||
|
||||
Zimit is a scraper that creates a ZIM file from any website.
|
||||
|
||||
[](https://ghcr.io/openzim/zimit)
|
||||
[](https://github.com/openzim/zimit/actions?query=branch%3Amain)
|
||||
[](https://www.codefactor.io/repository/github/openzim/zimit)
|
||||
[](https://www.gnu.org/licenses/gpl-3.0)
|
||||
|
||||
⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS.
|
||||
[](https://ghcr.io/openzim/zimit)
|
||||
|
||||
Technical background
|
||||
--------------------
|
||||
@ -68,7 +65,10 @@ default and prints the crawl status to the Docker log.
|
||||
Nota bene
|
||||
---------
|
||||
|
||||
A first version of a generic HTTP scraper was created in 2016 during
|
||||
While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
|
||||
since Zimit 2.x which does not have any special requirements anymore.
|
||||
|
||||
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
|
||||
the [Wikimania Esino Lario
|
||||
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
|
||||
|
||||
|
224
pyproject.toml
Normal file
224
pyproject.toml
Normal file
@ -0,0 +1,224 @@
|
||||
[build-system]
|
||||
requires = ["hatchling", "hatch-openzim==0.2.0"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "zimit"
|
||||
requires-python = ">=3.12,<3.13"
|
||||
description = "Make ZIM file from any website through crawling"
|
||||
readme = "README.md"
|
||||
dependencies = [
|
||||
"requests==2.31.0",
|
||||
"inotify==0.2.10",
|
||||
"tld==0.13",
|
||||
"warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
|
||||
]
|
||||
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
|
||||
|
||||
[tool.hatch.metadata.hooks.openzim-metadata]
|
||||
kind = "scraper"
|
||||
|
||||
[tool.hatch.metadata]
|
||||
allow-direct-references = true # to be removed once we use a released warc2zim version
|
||||
|
||||
[project.optional-dependencies]
|
||||
scripts = [
|
||||
"invoke==2.2.0",
|
||||
]
|
||||
lint = [
|
||||
"black==24.2.0",
|
||||
"ruff==0.3.0",
|
||||
]
|
||||
check = [
|
||||
"pyright==1.1.352",
|
||||
]
|
||||
test = [
|
||||
"pytest==8.0.2",
|
||||
"coverage==7.4.3",
|
||||
]
|
||||
dev = [
|
||||
"pre-commit==3.6.2",
|
||||
"debugpy==1.8.1",
|
||||
"zimit[scripts]",
|
||||
"zimit[lint]",
|
||||
"zimit[test]",
|
||||
"zimit[check]",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
zimit = "zimit:zimit.zimit"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/zimit/__about__.py"
|
||||
|
||||
[tool.hatch.build]
|
||||
exclude = [
|
||||
"/.github",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/zimit"]
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
features = ["dev"]
|
||||
|
||||
[tool.hatch.envs.test]
|
||||
features = ["scripts", "test"]
|
||||
|
||||
[tool.hatch.envs.test.scripts]
|
||||
run = "inv test --args '{args}'"
|
||||
run-cov = "inv test-cov --args '{args}'"
|
||||
report-cov = "inv report-cov"
|
||||
coverage = "inv coverage --args '{args}'"
|
||||
html = "inv coverage --html --args '{args}'"
|
||||
|
||||
[tool.hatch.envs.lint]
|
||||
template = "lint"
|
||||
skip-install = false
|
||||
features = ["scripts", "lint"]
|
||||
|
||||
[tool.hatch.envs.lint.scripts]
|
||||
black = "inv lint-black --args '{args}'"
|
||||
ruff = "inv lint-ruff --args '{args}'"
|
||||
all = "inv lintall --args '{args}'"
|
||||
fix-black = "inv fix-black --args '{args}'"
|
||||
fix-ruff = "inv fix-ruff --args '{args}'"
|
||||
fixall = "inv fixall --args '{args}'"
|
||||
|
||||
[tool.hatch.envs.check]
|
||||
features = ["scripts", "check"]
|
||||
|
||||
[tool.hatch.envs.check.scripts]
|
||||
pyright = "inv check-pyright --args '{args}'"
|
||||
all = "inv checkall --args '{args}'"
|
||||
|
||||
[tool.black]
|
||||
line-length = 88
|
||||
target-version = ['py312']
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py312"
|
||||
line-length = 88
|
||||
src = ["src"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
"A", # flake8-builtins
|
||||
# "ANN", # flake8-annotations
|
||||
"ARG", # flake8-unused-arguments
|
||||
# "ASYNC", # flake8-async
|
||||
"B", # flake8-bugbear
|
||||
# "BLE", # flake8-blind-except
|
||||
"C4", # flake8-comprehensions
|
||||
"C90", # mccabe
|
||||
# "COM", # flake8-commas
|
||||
# "D", # pydocstyle
|
||||
# "DJ", # flake8-django
|
||||
"DTZ", # flake8-datetimez
|
||||
"E", # pycodestyle (default)
|
||||
"EM", # flake8-errmsg
|
||||
# "ERA", # eradicate
|
||||
# "EXE", # flake8-executable
|
||||
"F", # Pyflakes (default)
|
||||
# "FA", # flake8-future-annotations
|
||||
"FBT", # flake8-boolean-trap
|
||||
# "FLY", # flynt
|
||||
# "G", # flake8-logging-format
|
||||
"I", # isort
|
||||
"ICN", # flake8-import-conventions
|
||||
# "INP", # flake8-no-pep420
|
||||
# "INT", # flake8-gettext
|
||||
"ISC", # flake8-implicit-str-concat
|
||||
"N", # pep8-naming
|
||||
# "NPY", # NumPy-specific rules
|
||||
# "PD", # pandas-vet
|
||||
# "PGH", # pygrep-hooks
|
||||
# "PIE", # flake8-pie
|
||||
# "PL", # Pylint
|
||||
"PLC", # Pylint: Convention
|
||||
"PLE", # Pylint: Error
|
||||
"PLR", # Pylint: Refactor
|
||||
"PLW", # Pylint: Warning
|
||||
# "PT", # flake8-pytest-style
|
||||
# "PTH", # flake8-use-pathlib
|
||||
# "PYI", # flake8-pyi
|
||||
"Q", # flake8-quotes
|
||||
# "RET", # flake8-return
|
||||
# "RSE", # flake8-raise
|
||||
"RUF", # Ruff-specific rules
|
||||
"S", # flake8-bandit
|
||||
# "SIM", # flake8-simplify
|
||||
# "SLF", # flake8-self
|
||||
"T10", # flake8-debugger
|
||||
"T20", # flake8-print
|
||||
# "TCH", # flake8-type-checking
|
||||
# "TD", # flake8-todos
|
||||
"TID", # flake8-tidy-imports
|
||||
# "TRY", # tryceratops
|
||||
"UP", # pyupgrade
|
||||
"W", # pycodestyle
|
||||
"YTT", # flake8-2020
|
||||
]
|
||||
ignore = [
|
||||
# Allow non-abstract empty methods in abstract base classes
|
||||
"B027",
|
||||
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
|
||||
"EM",
|
||||
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
||||
"FBT003",
|
||||
# Ignore checks for possible passwords
|
||||
"S105", "S106", "S107",
|
||||
# Ignore warnings on subprocess.run / popen
|
||||
"S603",
|
||||
# Ignore complexity
|
||||
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
|
||||
]
|
||||
unfixable = [
|
||||
# Don't touch unused imports
|
||||
"F401",
|
||||
]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
known-first-party = ["zimit"]
|
||||
|
||||
[tool.ruff.lint.flake8-bugbear]
|
||||
# add exceptions to B008 for fastapi.
|
||||
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
|
||||
|
||||
[tool.ruff.lint.flake8-tidy-imports]
|
||||
ban-relative-imports = "all"
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
# Tests can use magic values, assertions, and relative imports
|
||||
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "7.3"
|
||||
testpaths = ["tests"]
|
||||
pythonpath = [".", "src"]
|
||||
|
||||
[tool.coverage.paths]
|
||||
zimit = ["src/zimit"]
|
||||
tests = ["tests"]
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["zimit"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/zimit/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
"no cov",
|
||||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
[tool.pyright]
|
||||
include = ["src", "tests", "tasks.py"]
|
||||
exclude = [".env/**", ".venv/**"]
|
||||
extraPaths = ["src"]
|
||||
pythonVersion = "3.12"
|
||||
typeCheckingMode="basic"
|
1
src/zimit/__about__.py
Normal file
1
src/zimit/__about__.py
Normal file
@ -0,0 +1 @@
|
||||
__version__ = "2.0.0-dev4"
|
@ -1,7 +1,3 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim: ai ts=4 sts=4 et sw=4 nu
|
||||
|
||||
"""
|
||||
Main zimit run script
|
||||
This script validates arguments with warc2zim, checks permissions
|
||||
@ -11,6 +7,7 @@ and then calls the Node based driver
|
||||
import atexit
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
@ -23,19 +20,24 @@ from pathlib import Path
|
||||
|
||||
import inotify
|
||||
import inotify.adapters
|
||||
import requests
|
||||
from tld import get_fld
|
||||
from warc2zim.main import main as warc2zim
|
||||
from zimscraperlib.logging import getLogger
|
||||
from zimscraperlib.uri import rebuild_uri
|
||||
|
||||
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
|
||||
from zimit.__about__ import __version__
|
||||
|
||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||
|
||||
logger = getLogger(name="zimit", level=logging.INFO)
|
||||
|
||||
|
||||
class ProgressFileWatcher:
|
||||
def __init__(self, output_dir, stats_path):
|
||||
def __init__(self, output_dir: Path, stats_path: Path):
|
||||
self.crawl_path = output_dir / "crawl.json"
|
||||
self.warc2zim_path = output_dir / "warc2zim.json"
|
||||
self.stats_path = Path(stats_path)
|
||||
self.stats_path = stats_path
|
||||
|
||||
if not self.stats_path.is_absolute():
|
||||
self.stats_path = output_dir / self.stats_path
|
||||
@ -46,6 +48,8 @@ class ProgressFileWatcher:
|
||||
self.process = None
|
||||
|
||||
def stop(self):
|
||||
if not self.process:
|
||||
return
|
||||
self.process.join(0.1)
|
||||
self.process.terminate()
|
||||
|
||||
@ -58,10 +62,10 @@ class ProgressFileWatcher:
|
||||
self.process.start()
|
||||
|
||||
@staticmethod
|
||||
def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath):
|
||||
def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
|
||||
ino = inotify.adapters.Inotify()
|
||||
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
|
||||
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
|
||||
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
|
||||
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
|
||||
|
||||
class Limit:
|
||||
def __init__(self):
|
||||
@ -97,15 +101,15 @@ class ProgressFileWatcher:
|
||||
"limit": limit.as_dict,
|
||||
}
|
||||
|
||||
for _, _, fpath, _ in ino.event_gen(yield_nones=False):
|
||||
for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore
|
||||
func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
|
||||
if not func:
|
||||
continue
|
||||
# open input and output separately so as not to clear output on error
|
||||
with open(fpath, "r") as ifh:
|
||||
with open(fpath) as ifh:
|
||||
try:
|
||||
out = func(json.load(ifh), limit)
|
||||
except Exception: # nosec
|
||||
except Exception: # nosec # noqa: S112
|
||||
# simply ignore progress update should an error arise
|
||||
# might be malformed input for instance
|
||||
continue
|
||||
@ -115,7 +119,7 @@ class ProgressFileWatcher:
|
||||
json.dump(out, ofh)
|
||||
|
||||
|
||||
def zimit(args=None):
|
||||
def run(raw_args):
|
||||
wait_until_options = ["load", "domcontentloaded", "networkidle"]
|
||||
wait_until_all = wait_until_options + [
|
||||
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
|
||||
@ -131,7 +135,7 @@ def zimit(args=None):
|
||||
|
||||
parser.add_argument(
|
||||
"--urlFile",
|
||||
help="If set, read a list of seed urls, " "one per line, from the specified",
|
||||
help="If set, read a list of seed urls, one per line, from the specified",
|
||||
)
|
||||
|
||||
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
|
||||
@ -205,7 +209,8 @@ def zimit(args=None):
|
||||
|
||||
parser.add_argument(
|
||||
"--lang",
|
||||
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||
help="if set, sets the language used by the browser, should be ISO 639 "
|
||||
"language[-country] code",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@ -220,12 +225,21 @@ def zimit(args=None):
|
||||
help="Emulate mobile device by name from "
|
||||
"https://github.com/puppeteer/puppeteer/blob/"
|
||||
"main/packages/puppeteer-core/src/common/Device.ts",
|
||||
default="Pixel 2",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--noMobileDevice",
|
||||
help="Do not emulate a mobile device (use at your own risk, behavior is"
|
||||
"uncertain)",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--userAgent",
|
||||
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
|
||||
default=DEFAULT_USER_AGENT,
|
||||
help="Override default user-agent with specified value ; --userAgentSuffix and "
|
||||
"--adminEmail have no effect when this is set",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@ -333,7 +347,38 @@ def zimit(args=None):
|
||||
"to configure the crawling behaviour if not set via argument.",
|
||||
)
|
||||
|
||||
zimit_args, warc2zim_args = parser.parse_known_args(args)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
help="Display scraper version and exit",
|
||||
action="version",
|
||||
version=f"Zimit {__version__}",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--logging",
|
||||
help="Crawler logging configuration",
|
||||
)
|
||||
|
||||
zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
|
||||
|
||||
logger.info("Checking browsertrix-crawler version")
|
||||
crawl_version_cmd = ["crawl", "--version"]
|
||||
try:
|
||||
crawl = subprocess.run(
|
||||
crawl_version_cmd, check=True, capture_output=True, text=True
|
||||
)
|
||||
except Exception:
|
||||
logger.error("Failed to get Browsertrix crawler version")
|
||||
raise
|
||||
crawler_version = crawl.stdout.strip()
|
||||
logger.info(f"Browsertrix crawler: version {crawler_version}")
|
||||
|
||||
# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
|
||||
# versions are associated with the ZIM
|
||||
warc2zim_args.append("--scraper-suffix")
|
||||
warc2zim_args.append(
|
||||
f" + zimit {__version__} + Browsertrix crawler {crawler_version}"
|
||||
)
|
||||
|
||||
# pass url and output to warc2zim also
|
||||
if zimit_args.output:
|
||||
@ -342,14 +387,12 @@ def zimit(args=None):
|
||||
|
||||
url = zimit_args.url
|
||||
|
||||
user_agent = zimit_args.userAgent
|
||||
if zimit_args.userAgentSuffix:
|
||||
user_agent += f" {zimit_args.userAgentSuffix}"
|
||||
user_agent_suffix = zimit_args.userAgentSuffix
|
||||
if zimit_args.adminEmail:
|
||||
user_agent += f" {zimit_args.adminEmail}"
|
||||
user_agent_suffix += f" {zimit_args.adminEmail}"
|
||||
|
||||
if url:
|
||||
url = check_url(url, user_agent, zimit_args.scopeType)
|
||||
url = get_cleaned_url(url)
|
||||
warc2zim_args.append("--url")
|
||||
warc2zim_args.append(url)
|
||||
|
||||
@ -372,13 +415,13 @@ def zimit(args=None):
|
||||
warc2zim_args.append("--lang")
|
||||
warc2zim_args.append(zimit_args.zim_lang)
|
||||
|
||||
print("----------")
|
||||
print("Testing warc2zim args")
|
||||
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
||||
logger.info("----------")
|
||||
logger.info("Testing warc2zim args")
|
||||
logger.info("Running: warc2zim " + " ".join(warc2zim_args))
|
||||
res = warc2zim(warc2zim_args)
|
||||
if res != 100:
|
||||
print("Exiting, invalid warc2zim params")
|
||||
return 2
|
||||
if res != NORMAL_WARC2ZIM_EXIT_CODE:
|
||||
logger.info("Exiting, invalid warc2zim params")
|
||||
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
|
||||
|
||||
# make temp dir for this crawl
|
||||
if zimit_args.build:
|
||||
@ -389,9 +432,9 @@ def zimit(args=None):
|
||||
if not zimit_args.keep:
|
||||
|
||||
def cleanup():
|
||||
print("")
|
||||
print("----------")
|
||||
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
|
||||
logger.info("")
|
||||
logger.info("----------")
|
||||
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
|
||||
shutil.rmtree(temp_root_dir)
|
||||
|
||||
atexit.register(cleanup)
|
||||
@ -401,8 +444,12 @@ def zimit(args=None):
|
||||
cmd_args.append("--url")
|
||||
cmd_args.append(url)
|
||||
|
||||
cmd_args.append("--userAgent")
|
||||
cmd_args.append(user_agent)
|
||||
cmd_args.append("--userAgentSuffix")
|
||||
cmd_args.append(user_agent_suffix)
|
||||
|
||||
if not zimit_args.noMobileDevice:
|
||||
cmd_args.append("--mobileDevice")
|
||||
cmd_args.append(zimit_args.mobileDevice)
|
||||
|
||||
cmd_args.append("--cwd")
|
||||
cmd_args.append(str(temp_root_dir))
|
||||
@ -412,7 +459,7 @@ def zimit(args=None):
|
||||
watcher = ProgressFileWatcher(
|
||||
Path(zimit_args.output), Path(zimit_args.statsFilename)
|
||||
)
|
||||
print(f"Writing progress to {watcher.stats_path}")
|
||||
logger.info(f"Writing progress to {watcher.stats_path}")
|
||||
# update crawler command
|
||||
cmd_args.append("--statsFilename")
|
||||
cmd_args.append(str(watcher.crawl_path))
|
||||
@ -424,15 +471,16 @@ def zimit(args=None):
|
||||
|
||||
cmd_line = " ".join(cmd_args)
|
||||
|
||||
print("")
|
||||
print("----------")
|
||||
print(
|
||||
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
|
||||
logger.info("")
|
||||
logger.info("----------")
|
||||
logger.info(
|
||||
f"Output to tempdir: {temp_root_dir} - "
|
||||
f"{'will keep' if zimit_args.keep else 'will delete'}"
|
||||
)
|
||||
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
|
||||
crawl = subprocess.run(cmd_args)
|
||||
if crawl.returncode == 11:
|
||||
print("crawl interupted by a limit")
|
||||
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
|
||||
crawl = subprocess.run(cmd_args, check=False)
|
||||
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
|
||||
logger.info("crawl interupted by a limit")
|
||||
elif crawl.returncode != 0:
|
||||
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
|
||||
|
||||
@ -447,65 +495,33 @@ def zimit(args=None):
|
||||
"Failed to find directory where WARC files have been created"
|
||||
)
|
||||
elif len(warc_dirs) > 1:
|
||||
print("Found many WARC files directories, only last one will be used")
|
||||
logger.info("Found many WARC files directories, only last one will be used")
|
||||
for directory in warc_dirs:
|
||||
print(f"- {directory}")
|
||||
logger.info(f"- {directory}")
|
||||
warc_directory = warc_dirs[-1]
|
||||
|
||||
print("")
|
||||
print("----------")
|
||||
print(f"Processing WARC files in {warc_directory}")
|
||||
logger.info("")
|
||||
logger.info("----------")
|
||||
logger.info(f"Processing WARC files in {warc_directory}")
|
||||
warc2zim_args.append(str(warc_directory))
|
||||
|
||||
num_files = sum(1 for _ in warc_directory.iterdir())
|
||||
print(f"{num_files} WARC files found", flush=True)
|
||||
print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)
|
||||
logger.info(f"{num_files} WARC files found")
|
||||
logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
|
||||
|
||||
return warc2zim(warc2zim_args)
|
||||
|
||||
|
||||
def check_url(url, user_agent, scope=None):
|
||||
url = urllib.parse.urlparse(url)
|
||||
try:
|
||||
with requests.get(
|
||||
url.geturl(),
|
||||
stream=True,
|
||||
allow_redirects=True,
|
||||
timeout=(12.2, 27),
|
||||
headers={"User-Agent": user_agent},
|
||||
) as resp:
|
||||
resp.raise_for_status()
|
||||
except requests.exceptions.RequestException as exc:
|
||||
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
|
||||
raise SystemExit(1)
|
||||
actual_url = urllib.parse.urlparse(resp.url)
|
||||
def get_cleaned_url(url: str):
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
|
||||
# remove the explicit port from the URI when it is the scheme's default, as browsers do
|
||||
if actual_url.scheme == "https" and actual_url.port == 443:
|
||||
actual_url = rebuild_uri(actual_url, port="")
|
||||
if actual_url.scheme == "http" and actual_url.port == 80:
|
||||
actual_url = rebuild_uri(actual_url, port="")
|
||||
if parsed_url.scheme == "https" and parsed_url.port == 443: # noqa: PLR2004
|
||||
parsed_url = rebuild_uri(parsed_url, port="")
|
||||
if parsed_url.scheme == "http" and parsed_url.port == 80: # noqa: PLR2004
|
||||
parsed_url = rebuild_uri(parsed_url, port="")
|
||||
|
||||
if actual_url.geturl() != url.geturl():
|
||||
if scope in (None, "any"):
|
||||
return actual_url.geturl()
|
||||
|
||||
print(
|
||||
"[WARN] Your URL ({0}) redirects to {1} which {2} on same "
|
||||
"first-level domain. Depending on your scopeType ({3}), "
|
||||
"your homepage might be out-of-scope. Please check!".format(
|
||||
url.geturl(),
|
||||
actual_url.geturl(),
|
||||
"is"
|
||||
if get_fld(url.geturl()) == get_fld(actual_url.geturl())
|
||||
else "is not",
|
||||
scope,
|
||||
)
|
||||
)
|
||||
|
||||
return actual_url.geturl()
|
||||
|
||||
return url.geturl()
|
||||
return parsed_url.geturl()
|
||||
|
||||
|
||||
def get_node_cmd_line(args):
|
||||
@ -527,7 +543,7 @@ def get_node_cmd_line(args):
|
||||
"collection",
|
||||
"allowHashUrls",
|
||||
"lang",
|
||||
"mobileDevice",
|
||||
"userAgent",
|
||||
"useSitemap",
|
||||
"behaviors",
|
||||
"behaviorTimeout",
|
||||
@ -539,9 +555,10 @@ def get_node_cmd_line(args):
|
||||
"healthCheckPort",
|
||||
"overwrite",
|
||||
"config",
|
||||
"logging",
|
||||
]:
|
||||
value = getattr(args, arg)
|
||||
if value == None or (isinstance(value, bool) and value == False):
|
||||
if value is None or (isinstance(value, bool) and value is False):
|
||||
continue
|
||||
node_cmd.append("--" + arg)
|
||||
if not isinstance(value, bool):
|
||||
@ -550,17 +567,22 @@ def get_node_cmd_line(args):
|
||||
return node_cmd
|
||||
|
||||
|
||||
def sigint_handler(*args):
|
||||
print("")
|
||||
print("")
|
||||
print("SIGINT/SIGTERM received, stopping zimit")
|
||||
print("")
|
||||
print("", flush=True)
|
||||
def sigint_handler(*args):  # noqa: ARG001
    """Log that a termination signal was received, then exit with status 3.

    The positional arguments passed by the ``signal`` module are accepted but
    ignored.
    """
    # blank records before and after make the notice stand out in the log
    for message in ("", "", "SIGINT/SIGTERM received, stopping zimit", "", ""):
        logger.info(message)
    sys.exit(3)
|
||||
|
||||
|
||||
def zimit():
    """Entry point: call ``run`` with the process's command-line arguments."""
    cli_args = sys.argv[1:]  # drop the program name
    run(cli_args)
|
||||
|
||||
|
||||
signal.signal(signal.SIGINT, sigint_handler)
|
||||
signal.signal(signal.SIGTERM, sigint_handler)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
zimit()
|
109
tasks.py
Normal file
109
tasks.py
Normal file
@ -0,0 +1,109 @@
|
||||
# pyright: strict, reportUntypedFunctionDecorator=false
|
||||
import os
|
||||
|
||||
from invoke.context import Context
|
||||
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
|
||||
|
||||
# Run commands in a pseudo-terminal unless the CI environment variable is set
# to a non-empty value.
use_pty = not os.environ.get("CI", "")
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """Run the tests with pytest (without coverage)."""
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """Run the tests with pytest under coverage measurement."""
    command = f"coverage run -m pytest {args}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """Combine coverage data and print the report; optionally export HTML."""
    # the combine step is run with warn=True, unlike the reporting commands
    ctx.run("coverage combine", warn=True, pty=use_pty)

    reporting_commands = ["coverage report --show-missing"]
    if html:
        reporting_commands.append("coverage html")
    for command in reporting_commands:
        ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
    optional=["args", "html"],
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """Run the tests under coverage, then report the coverage results."""
    # step 1: collect coverage data while running the tests
    test_cov(ctx, args=args)
    # step 2: print the report (and the HTML export when requested)
    report_cov(ctx, html=html)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """Check formatting with black (``--check --diff``)."""
    # an empty value falls back to the current directory (needed for hatch script)
    target = args or "."
    ctx.run("black --version", pty=use_pty)
    ctx.run(f"black --check --diff {target}", pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """Check linting rules with ruff (``ruff check``)."""
    # an empty value falls back to the current directory (needed for hatch script)
    target = args or "."
    ctx.run("ruff --version", pty=use_pty)
    ctx.run(f"ruff check {target}", pty=use_pty)
|
||||
|
||||
|
||||
@task(
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
    optional=["args"],
)
def lintall(ctx: Context, args: str = "."):
    """Run every lint check: black first, then ruff."""
    # an empty value falls back to the current directory (needed for hatch script)
    target = args or "."
    for linter in (lint_black, lint_ruff):
        linter(ctx, target)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """Check static types with pyright.

    The pyright version is printed first, then pyright is run with the
    additional arguments given in ``args``.
    """
    # pass pty=use_pty here too, consistent with every other command in this file
    ctx.run("pyright --version", pty=use_pty)
    ctx.run(f"pyright {args}", pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """Run every static type check (currently pyright only)."""
    pyright_args = args
    check_pyright(ctx, pyright_args)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """Reformat code with black."""
    # an empty value falls back to the current directory (needed for hatch script)
    target = args or "."
    command = f"black {target}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """Apply ruff's automatic fixes (``ruff check --fix``)."""
    # an empty value falls back to the current directory (needed for hatch script)
    target = args or "."
    command = f"ruff check --fix {target}"
    ctx.run(command, pty=use_pty)
|
||||
|
||||
|
||||
@task(
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
    optional=["args"],
)
def fixall(ctx: Context, args: str = "."):
    """Apply all automatic fixes (black, then ruff), then re-run the lint checks."""
    # an empty value falls back to the current directory (needed for hatch script)
    target = args or "."
    for step in (fix_black, fix_ruff, lintall):
        step(ctx, target)
|
1
tests-integration/README.md
Normal file
1
tests-integration/README.md
Normal file
@ -0,0 +1 @@
|
||||
These are integration tests, meant to be run in CI, because they require first performing a zimit run on a given website and then checking its output.
|
@ -1,14 +1,9 @@
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
|
||||
import libzim.reader
|
||||
from warcio import ArchiveIterator
|
||||
|
||||
|
||||
def get_zim_article(zimfile, path):
|
||||
zim_fh = libzim.reader.Archive(zimfile)
|
||||
return zim_fh.get_entry_by_path(path).get_item().content.tobytes()
|
||||
from zimscraperlib.zim import Archive
|
||||
|
||||
|
||||
def test_is_file():
|
||||
@ -20,20 +15,34 @@ def test_zim_main_page():
|
||||
"""Main page specified, http://isago.rskg.org/, was a redirect to https
|
||||
Ensure main page is the redirected page"""
|
||||
|
||||
assert b'"https://isago.rskg.org/"' in get_zim_article(
|
||||
"/output/isago.zim", "A/index.html"
|
||||
)
|
||||
main_entry = Archive("/output/isago.zim").main_entry
|
||||
assert main_entry.is_redirect
|
||||
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
|
||||
|
||||
|
||||
def test_zim_scraper():
    """Check the "Scraper" text metadata of the produced ZIM.

    The metadata must mention zimit, warc2zim and Browsertrix crawler.
    """

    scraper = Archive("/output/isago.zim").get_text_metadata("Scraper")
    for expected_tool in ("zimit ", "warc2zim ", "Browsertrix crawler "):
        assert expected_tool in scraper
|
||||
|
||||
|
||||
def test_user_agent():
|
||||
"""Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
|
||||
"""Test that mobile user agent was used
|
||||
|
||||
Check is done in WARC request records with custom Zimit and email suffix
|
||||
"""
|
||||
|
||||
found = False
|
||||
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
|
||||
with open(warc, "rb") as fh:
|
||||
for record in ArchiveIterator(fh):
|
||||
if record.rec_type == "request":
|
||||
print(record.http_headers)
|
||||
print(record.http_headers) # noqa: T201
|
||||
ua = record.http_headers.get_header("User-Agent")
|
||||
if ua:
|
||||
assert "Mozilla" in ua
|
||||
@ -56,12 +65,12 @@ def test_stats_output():
|
||||
}
|
||||
with open("/output/warc2zim.json") as fh:
|
||||
assert json.loads(fh.read()) == {
|
||||
"written": 8,
|
||||
"total": 8,
|
||||
"written": 7,
|
||||
"total": 7,
|
||||
}
|
||||
with open("/output/stats.json") as fh:
|
||||
assert json.loads(fh.read()) == {
|
||||
"done": 8,
|
||||
"total": 8,
|
||||
"done": 7,
|
||||
"total": 7,
|
||||
"limit": {"max": 0, "hit": False},
|
||||
}
|
6
tests/test_dummy.py
Normal file
6
tests/test_dummy.py
Normal file
@ -0,0 +1,6 @@
|
||||
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
|
||||
|
||||
|
||||
# dummy test, just to have coverage report done
|
||||
def test_something_exists():
    """Placeholder test: assert that the imported exit-code constant is truthy."""
    exit_code = NORMAL_WARC2ZIM_EXIT_CODE
    assert exit_code
|
Loading…
x
Reference in New Issue
Block a user