From 1c58bbe3038f582dbec7006d573ed5bd50e73227 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Sun, 14 Jan 2024 12:08:54 +0100 Subject: [PATCH 01/29] Adapt to `warc2zim2` branch of warc2zim. `warc2zim2` branch create zim files without service worker. --- Dockerfile | 2 +- test/integration.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 298f7ec..d471fb6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN apt-get update \ # python setup (in venv not to conflict with browsertrix) && python3 -m venv /app/zimit \ && /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \ - 'git+https://github.com/openzim/warc2zim@main#egg_name=warc2zim' \ + 'git+https://github.com/openzim/warc2zim@warc2zim2#egg_name=warc2zim' \ # placeholder (default output location) && mkdir -p /output \ # disable chrome upgrade diff --git a/test/integration.py b/test/integration.py index 1782b34..95f7924 100644 --- a/test/integration.py +++ b/test/integration.py @@ -6,9 +6,9 @@ import libzim.reader from warcio import ArchiveIterator -def get_zim_article(zimfile, path): +def get_zim_main_entry(zimfile): zim_fh = libzim.reader.Archive(zimfile) - return zim_fh.get_entry_by_path(path).get_item().content.tobytes() + return zim_fh.main_entry def test_is_file(): @@ -20,9 +20,9 @@ def test_zim_main_page(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - assert b'"https://isago.rskg.org/"' in get_zim_article( - "/output/isago.zim", "A/index.html" - ) + main_entry = get_zim_main_entry("/output/isago.zim") + assert main_entry.is_redirect + assert main_entry.get_redirect_entry().path == "isago.rskg.org/" def test_user_agent(): From e034b088521bc7ce053145682da125cd27e06abf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 15 Jan 2024 08:01:35 +0100 Subject: [PATCH 02/29] Update CHANGELOG --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a9c7a1..3332923 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Adapt to new `warc2zim` code structure -- Using `main` warc2zim ⚠️ change before releasing! +- Use `warc2zim` version 2, which works without Service Worker anymore +- Using `warc2zim2` warc2zim ⚠️ change before releasing! 
### Added From a352c0c40207e0d625d3d5d980073da87bf1e151 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 15 Jan 2024 08:03:58 +0100 Subject: [PATCH 03/29] Add temporary Github Actions workflow to build zimit2 image --- .github/workflows/docker_zimit2.yml | 31 +++++++++++++++++++++++++++++ CHANGELOG.md | 1 + 2 files changed, 32 insertions(+) create mode 100644 .github/workflows/docker_zimit2.yml diff --git a/.github/workflows/docker_zimit2.yml b/.github/workflows/docker_zimit2.yml new file mode 100644 index 0000000..e611038 --- /dev/null +++ b/.github/workflows/docker_zimit2.yml @@ -0,0 +1,31 @@ +name: Docker Zimit2 + +on: + push: + branches: + - zimit2 + +jobs: + build-and-push: + name: Deploy Docker Image + runs-on: ubuntu-22.04 + + steps: + - name: Retrieve source code + uses: actions/checkout@v3 + + - name: Build and push + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/zimit + manual-tag: zimit2 + restrict-to: openzim/zimit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto + platforms: | + linux/amd64 + linux/arm64 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3332923..1008e29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Use `warc2zim` version 2, which works without Service Worker anymore - Using `warc2zim2` warc2zim ⚠️ change before releasing! +- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing! ### Added From 27f9dcc53f86939c5956b0fce7d2836341125fa7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 15 Jan 2024 17:45:56 +0100 Subject: [PATCH 04/29] Empty commit to release warc2zim2 commit aca2db3 From 343fb7e7704a938b3301db85764583fba80519c9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 18 Jan 2024 13:23:30 +0100 Subject: [PATCH 05/29] Replace warning about service workers by a nota bene about there removal since 2.x --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cdf0966..9c928a5 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ Zimit is a scraper allowing to create ZIM file from any Web site. [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) -⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS. - Technical background -------------------- @@ -68,7 +66,10 @@ default and prints the crawl status to the Docker log. Nota bene --------- -A first version of a generic HTTP scraper was created in 2016 during +While Zimit 1.x relied on a Service Worker to display the ZIM content, this is not anymore the case +since Zimit 2.x which does not have any special requirements anymore. + +It should also be noted that a first version of a generic HTTP scraper was created in 2016 during the [Wikimania Esino Lario Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon). 
From c0ffb74d8c7790b0776adc5064184c483d19f990 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 18 Jan 2024 13:27:55 +0100 Subject: [PATCH 06/29] Adopt Python bootstrap conventions --- .dockerignore | 2 - .github/workflows/{docker.yml => Publish.yml} | 18 +- .github/workflows/PublishDockerDevImage.yaml | 30 +++ ...mit2.yml => PublishDockerZimit2Image.yaml} | 11 +- .github/workflows/QA.yaml | 34 +++ .github/workflows/Tests.yaml | 66 +++++ .github/workflows/ci.yaml | 20 -- .pre-commit-config.yaml | 27 ++ CHANGELOG.md | 6 +- Dockerfile | 67 ++--- README.md | 5 +- pyproject.toml | 233 ++++++++++++++++++ src/zimit/__about__.py | 1 + zimit.py => src/zimit/zimit.py | 156 +++++++----- tasks.py | 109 ++++++++ tests-integration/README.md | 1 + {test => tests-integration}/integration.py | 9 +- tests/test_dummy.py | 6 + 18 files changed, 661 insertions(+), 140 deletions(-) delete mode 100644 .dockerignore rename .github/workflows/{docker.yml => Publish.yml} (69%) create mode 100644 .github/workflows/PublishDockerDevImage.yaml rename .github/workflows/{docker_zimit2.yml => PublishDockerZimit2Image.yaml} (76%) create mode 100644 .github/workflows/QA.yaml create mode 100644 .github/workflows/Tests.yaml delete mode 100644 .github/workflows/ci.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml create mode 100644 src/zimit/__about__.py rename zimit.py => src/zimit/zimit.py (79%) create mode 100644 tasks.py create mode 100644 tests-integration/README.md rename {test => tests-integration}/integration.py (90%) create mode 100644 tests/test_dummy.py diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index e1d45cf..0000000 --- a/.dockerignore +++ /dev/null @@ -1,2 +0,0 @@ -output/ -node_modules/ diff --git a/.github/workflows/docker.yml b/.github/workflows/Publish.yml similarity index 69% rename from .github/workflows/docker.yml rename to .github/workflows/Publish.yml index e75cd9a..8399a83 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/Publish.yml @@ -1,26 +1,20 @@ -name: Docker +name: Publish released version on: - push: - branches: - - main - tags: - - v* + release: + types: [published] jobs: - build-and-push: - name: Deploy Docker Image + publish: runs-on: ubuntu-22.04 steps: - - name: Retrieve source code - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Build and push + - name: Build and push Docker image uses: openzim/docker-publish-action@v10 with: image-name: openzim/zimit - on-master: dev tag-pattern: /^v([0-9.]+)$/ latest-on-tag: true restrict-to: openzim/zimit diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml new file mode 100644 index 0000000..18e7abf --- /dev/null +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -0,0 +1,30 @@ +name: Publish Docker dev image + +on: + push: + branches: + - main + +jobs: + publish: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/zimit + manual-tag: dev + latest-on-tag: false + restrict-to: openzim/zimit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto + platforms: | + linux/amd64 + linux/arm64 diff --git a/.github/workflows/docker_zimit2.yml b/.github/workflows/PublishDockerZimit2Image.yaml similarity index 76% rename from .github/workflows/docker_zimit2.yml rename to 
.github/workflows/PublishDockerZimit2Image.yaml index e611038..45d82da 100644 --- a/.github/workflows/docker_zimit2.yml +++ b/.github/workflows/PublishDockerZimit2Image.yaml @@ -1,4 +1,4 @@ -name: Docker Zimit2 +name: Publish Docker zimit2 image on: push: @@ -6,19 +6,18 @@ on: - zimit2 jobs: - build-and-push: - name: Deploy Docker Image + publish: runs-on: ubuntu-22.04 steps: - - name: Retrieve source code - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Build and push + - name: Build and push Docker image uses: openzim/docker-publish-action@v10 with: image-name: openzim/zimit manual-tag: zimit2 + latest-on-tag: false restrict-to: openzim/zimit registries: ghcr.io credentials: diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml new file mode 100644 index 0000000..48ccee5 --- /dev/null +++ b/.github/workflows/QA.yaml @@ -0,0 +1,34 @@ +name: QA + +on: + pull_request: + push: + branches: + - main + +jobs: + check-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[lint,scripts,test,check] + + - name: Check black formatting + run: inv lint-black + + - name: Check ruff + run: inv lint-ruff + + - name: Check pyright + run: inv check-pyright diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml new file mode 100644 index 0000000..4d097b2 --- /dev/null +++ b/.github/workflows/Tests.yaml @@ -0,0 +1,66 @@ +name: Tests + +on: + pull_request: + push: + branches: + - main + +jobs: + run-tests: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[test,scripts] + + - name: Run the tests + run: inv coverage --args "-vvv" + + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + + build_python: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Ensure we can build Python targets + run: | + pip install -U pip build + python3 -m build --sdist --wheel + + # this job replaces the standard "build_docker" job since it builds the docker image + run-integration-tests: + runs-on: ubuntu-22.04 + + steps: + - name: checkout + uses: actions/checkout@v3 + + - name: build image + run: docker build -t zimit . 
+ + - name: run crawl + run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + + - name: run integration test suite + run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml deleted file mode 100644 index 8083581..0000000 --- a/.github/workflows/ci.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: CI - -on: push - -jobs: - integration-tests: - runs-on: ubuntu-22.04 - - steps: - - name: checkout - uses: actions/checkout@v3 - - - name: build image - run: docker build -t zimit . - - - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - - - name: run integration test suite - run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..42d7ca3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/psf/black + rev: "23.12.1" + hooks: + - id: black +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.3 + hooks: + - id: ruff +- repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.347 + hooks: + - id: pyright + name: pyright (system) + description: 'pyright static type checker' + entry: pyright + language: system + 'types_or': [python, pyi] + require_serial: true + minimum_pre_commit_version: '2.9.2' diff --git a/CHANGELOG.md b/CHANGELOG.md index 894c625..5d52f42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- New `--version` flag to display Zimit version ### Changed - Use `warc2zim` version 2, which works without Service Worker anymore - Using `warc2zim2` warc2zim ⚠️ change before releasing! - Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing! +- Adopt Python bootstrap conventions ## [1.6.3] - 2024-01-18 @@ -156,7 +160,7 @@ if `--keep` is set. 
### Changed - using browsertrix-crawler `0.6.0` and warc2zim `1.4.2` -- default WARC location after crawl changed +- default WARC location after crawl changed from `collections/capture-*/archive/` to `collections/crawl-*/archive/` ### Removed diff --git a/Dockerfile b/Dockerfile index 1952c75..3435fbf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,35 +2,44 @@ FROM webrecorder/browsertrix-crawler:0.12.4 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ - && apt-get install -qqy --no-install-recommends \ - libmagic1 \ - python3.10-venv \ - && rm -rf /var/lib/apt/lists/* \ - # python setup (in venv not to conflict with browsertrix) - && python3 -m venv /app/zimit \ - && /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \ - 'git+https://github.com/openzim/warc2zim@warc2zim2#egg_name=warc2zim' \ - # placeholder (default output location) - && mkdir -p /output \ - # disable chrome upgrade - && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \ - # download list of bad domains to filter-out. intentionnaly ran post-install \ - # so it's not cached in earlier layers (url stays same but content updated) \ - mkdir -p /tmp/ads && cd /tmp/ads && \ - curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \ - curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \ - curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \ - cat ./*.txt > /etc/blocklist.txt \ - && rm ./*.txt \ - && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \ - chmod +x /usr/local/bin/entrypoint.sh + && apt-get install -qqy --no-install-recommends \ + libmagic1 \ + python3.11-venv \ + && rm -rf /var/lib/apt/lists/* \ + # python setup (in venv not to conflict with browsertrix) + && python3.11 -m venv /app/zimit \ + # placeholder (default output location) + && mkdir -p /output \ + # disable chrome upgrade + && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \ + # download list of bad domains to filter-out. 
intentionnaly ran post-install \ + # so it's not cached in earlier layers (url stays same but content updated) \ + && mkdir -p /tmp/ads \ + && cd /tmp/ads \ + && curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \ + && curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \ + && curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \ + && cat ./*.txt > /etc/blocklist.txt \ + && rm ./*.txt \ + && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \ + && chmod +x /usr/local/bin/entrypoint.sh -WORKDIR /app -ADD zimit.py /app/ -# fix shebang on zimit to use in-venv python -RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \ - && ln -s /app/zimit.py /usr/bin/zimit \ - && chmod +x /usr/bin/zimit +# Copy pyproject.toml and its dependencies +COPY pyproject.toml README.md /src/ +COPY src/zimit/__about__.py /src/src/zimit/__about__.py + +# Install Python dependencies +RUN /app/zimit/bin/python -m pip install --no-cache-dir /src + +# Copy code + associated artifacts +COPY src /src/src +COPY *.md /src/ + +# Install + cleanup +RUN /app/zimit/bin/python -m pip install --no-cache-dir /src \ + && ln -s /app/zimit/bin/zimit /usr/bin/zimit \ + && chmod +x /usr/bin/zimit \ + && rm -rf /src ENTRYPOINT ["entrypoint.sh"] -CMD ["zimit"] +CMD ["zimit", "--help"] diff --git a/README.md b/README.md index 9c928a5..cf1a35a 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,9 @@ Zimit Zimit is a scraper allowing to create ZIM file from any Web site. -[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit) -[![Build](https://github.com/openzim/zimit/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/zimit/actions?query=branch%3Amain) [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) +[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit) Technical background -------------------- @@ -43,7 +42,7 @@ The image accepts the following parameters, **as well as any of the [warc2zim](h - `--name` - Name of ZIM file (defaults to the hostname of the URL) - `--output` - output directory (defaults to `/output`) - `--limit U` - Limit capture to at most U URLs -- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. +- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. 
- `--scroll [N]` - if set, will activate a simple auto-scroll behavior on each page to scroll for upto N seconds - `--keep` - if set, keep the WARC files in a temp directory inside the output directory diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ad7db1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,233 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "zimit" +authors = [ + { name = "Kiwix", email = "dev@kiwix.org" }, +] +keywords = ["some"] +requires-python = ">=3.11,<3.12" +description = "Make ZIM file from any website through crawling" +readme = "README.md" +license = {text = "GPL-3.0-or-later"} +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", +] +dependencies = [ + "requests==2.31.0", + "inotify==0.2.10", + "tld==0.13", + "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2", +] +dynamic = ["version"] + +[tool.hatch.metadata] +allow-direct-references = true # to be removed once we use a released warc2zim version + +[project.optional-dependencies] +scripts = [ + "invoke==2.2.0", +] +lint = [ + "black==23.12.1", + "ruff==0.1.3", +] +check = [ + "pyright==1.1.347", +] +test = [ + "pytest==7.4.4", + "coverage==7.4.0", +] +dev = [ + "pre-commit==3.6.0", + "debugpy==1.8.0", + "zimit[scripts]", + "zimit[lint]", + "zimit[test]", + "zimit[check]", +] + +[project.urls] +Homepage = "https://github.com/openzim/zimit" +Donate = "https://www.kiwix.org/en/support-us/" + +[project.scripts] +zimit = "zimit:zimit.zimit" + +[tool.hatch.version] +path = "src/zimit/__about__.py" + +[tool.hatch.build] +exclude = [ + "/.github", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/zimit"] + +[tool.hatch.envs.default] +features = ["dev"] + +[tool.hatch.envs.test] +features = ["scripts", "test"] + +[tool.hatch.envs.test.scripts] +run = "inv test --args '{args}'" +run-cov = "inv test-cov --args '{args}'" +report-cov = "inv report-cov" +coverage = "inv coverage --args '{args}'" +html = "inv coverage --html --args '{args}'" + +[tool.hatch.envs.lint] +template = "lint" +skip-install = false +features = ["scripts", "lint"] + +[tool.hatch.envs.lint.scripts] +black = "inv lint-black --args '{args}'" +ruff = "inv lint-ruff --args '{args}'" +all = "inv lintall --args '{args}'" +fix-black = "inv fix-black --args '{args}'" +fix-ruff = "inv fix-ruff --args '{args}'" +fixall = "inv fixall --args '{args}'" + +[tool.hatch.envs.check] +features = ["scripts", "check"] + +[tool.hatch.envs.check.scripts] +pyright = "inv check-pyright --args '{args}'" +all = "inv checkall --args '{args}'" + +[tool.black] +line-length = 88 +target-version = ['py311'] + +[tool.ruff] +target-version = "py311" +line-length = 88 +src = ["src"] +select = [ + "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # flake8-no-pep420 + # 
"INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Remove flake8-errmsg since we consider they bloat the code and provide limited value + "EM", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore warnings on subprocess.run / popen + "S603", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["zimit"] + +[tool.ruff.flake8-bugbear] +# add exceptions to B008 for fastapi. +extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests**/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.pytest.ini_options] +minversion = "7.3" +testpaths = ["tests"] +pythonpath = [".", "src"] + +[tool.coverage.paths] +zimit = ["src/zimit"] +tests = ["tests"] + +[tool.coverage.run] +source_pkgs = ["zimit"] +branch = true +parallel = true +omit = [ + "src/zimit/__about__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pyright] +include = ["src", "tests", "tasks.py"] +exclude = [".env/**", ".venv/**"] +extraPaths = ["src"] +pythonVersion = "3.11" +typeCheckingMode="basic" diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py new file mode 100644 index 0000000..494af57 --- /dev/null +++ b/src/zimit/__about__.py @@ -0,0 +1 @@ +__version__ = "2.0.0-dev0" diff --git a/zimit.py b/src/zimit/zimit.py similarity index 79% rename from zimit.py rename to src/zimit/zimit.py index 909decb..e436feb 100755 --- a/zimit.py +++ b/src/zimit/zimit.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - """ Main zimit run script This script validates arguments with warc2zim, checks permissions @@ -11,6 +7,7 @@ and then calls the Node based driver import atexit import itertools import json +import logging import shutil import signal import subprocess @@ -26,16 +23,30 @@ import inotify.adapters import requests from tld import get_fld from warc2zim.main import main as warc2zim +from zimscraperlib.logging import getLogger from zimscraperlib.uri import rebuild_uri -DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15" +from zimit.__about__ import __version__ + +DEFAULT_USER_AGENT = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " + "(KHTML, 
like Gecko) Version/17.0 Safari/605.1.15" +) + +EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 +EXIT_CODE_CRAWLER_LIMIT_HIT = 11 +NORMAL_WARC2ZIM_EXIT_CODE = 100 + +LOGGER_NAME = Path(__file__).parent.name + +logger = getLogger(name=LOGGER_NAME, level=logging.INFO) class ProgressFileWatcher: - def __init__(self, output_dir, stats_path): + def __init__(self, output_dir: Path, stats_path: Path): self.crawl_path = output_dir / "crawl.json" self.warc2zim_path = output_dir / "warc2zim.json" - self.stats_path = Path(stats_path) + self.stats_path = stats_path if not self.stats_path.is_absolute(): self.stats_path = output_dir / self.stats_path @@ -46,6 +57,8 @@ class ProgressFileWatcher: self.process = None def stop(self): + if not self.process: + return self.process.join(0.1) self.process.terminate() @@ -58,10 +71,10 @@ class ProgressFileWatcher: self.process.start() @staticmethod - def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath): + def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str): ino = inotify.adapters.Inotify() - ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) - ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) + ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore + ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore class Limit: def __init__(self): @@ -97,15 +110,15 @@ class ProgressFileWatcher: "limit": limit.as_dict, } - for _, _, fpath, _ in ino.event_gen(yield_nones=False): + for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath) if not func: continue # open input and output separatly as to not clear output on error - with open(fpath, "r") as ifh: + with open(fpath) as ifh: try: out = func(json.load(ifh), limit) - except Exception: # nosec + except Exception: # nosec # noqa: S112 # simply ignore progress update should an error arise # might be malformed input for instance continue @@ -115,7 +128,7 @@ class ProgressFileWatcher: json.dump(out, ofh) -def zimit(args=None): +def run(raw_args): wait_until_options = ["load", "domcontentloaded", "networkidle"] wait_until_all = wait_until_options + [ f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2) @@ -131,7 +144,7 @@ def zimit(args=None): parser.add_argument( "--urlFile", - help="If set, read a list of seed urls, " "one per line, from the specified", + help="If set, read a list of seed urls, one per line, from the specified", ) parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers") @@ -205,7 +218,8 @@ def zimit(args=None): parser.add_argument( "--lang", - help="if set, sets the language used by the browser, should be ISO 639 language[-country] code", + help="if set, sets the language used by the browser, should be ISO 639 " + "language[-country] code", ) parser.add_argument( @@ -224,7 +238,8 @@ def zimit(args=None): parser.add_argument( "--userAgent", - help="Override default user-agent with specified value ; --userAgentSuffix is still applied", + help="Override default user-agent with specified value ; --userAgentSuffix is " + "still applied", default=DEFAULT_USER_AGENT, ) @@ -333,7 +348,14 @@ def zimit(args=None): "to configure the crawling behaviour if not set via argument.", ) - zimit_args, warc2zim_args = parser.parse_known_args(args) + parser.add_argument( + "--version", + help="Display scraper version and exit", + action="version", + version=f"Zimit {__version__}", + ) + + zimit_args, warc2zim_args = 
parser.parse_known_args(raw_args) # pass url and output to warc2zim also if zimit_args.output: @@ -372,13 +394,13 @@ def zimit(args=None): warc2zim_args.append("--lang") warc2zim_args.append(zimit_args.zim_lang) - print("----------") - print("Testing warc2zim args") - print("Running: warc2zim " + " ".join(warc2zim_args), flush=True) + logger.info("----------") + logger.info("Testing warc2zim args") + logger.info("Running: warc2zim " + " ".join(warc2zim_args)) res = warc2zim(warc2zim_args) - if res != 100: - print("Exiting, invalid warc2zim params") - return 2 + if res != NORMAL_WARC2ZIM_EXIT_CODE: + logger.info("Exiting, invalid warc2zim params") + return EXIT_CODE_WARC2ZIM_CHECK_FAILED # make temp dir for this crawl if zimit_args.build: @@ -389,9 +411,9 @@ def zimit(args=None): if not zimit_args.keep: def cleanup(): - print("") - print("----------") - print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True) + logger.info("") + logger.info("----------") + logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") shutil.rmtree(temp_root_dir) atexit.register(cleanup) @@ -412,7 +434,7 @@ def zimit(args=None): watcher = ProgressFileWatcher( Path(zimit_args.output), Path(zimit_args.statsFilename) ) - print(f"Writing progress to {watcher.stats_path}") + logger.info(f"Writing progress to {watcher.stats_path}") # update crawler command cmd_args.append("--statsFilename") cmd_args.append(str(watcher.crawl_path)) @@ -424,15 +446,16 @@ def zimit(args=None): cmd_line = " ".join(cmd_args) - print("") - print("----------") - print( - f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}" + logger.info("") + logger.info("----------") + logger.info( + f"Output to tempdir: {temp_root_dir} - " + f"{'will keep' if zimit_args.keep else 'will delete'}" ) - print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) - crawl = subprocess.run(cmd_args) - if crawl.returncode == 11: - print("crawl interupted by a limit") + logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") + crawl = subprocess.run(cmd_args, check=False) + if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: + logger.info("crawl interupted by a limit") elif crawl.returncode != 0: raise subprocess.CalledProcessError(crawl.returncode, cmd_args) @@ -447,28 +470,28 @@ def zimit(args=None): "Failed to find directory where WARC files have been created" ) elif len(warc_dirs) > 1: - print("Found many WARC files directories, only last one will be used") + logger.info("Found many WARC files directories, only last one will be used") for directory in warc_dirs: - print(f"- {directory}") + logger.info(f"- {directory}") warc_directory = warc_dirs[-1] - print("") - print("----------") - print(f"Processing WARC files in {warc_directory}") + logger.info("") + logger.info("----------") + logger.info(f"Processing WARC files in {warc_directory}") warc2zim_args.append(str(warc_directory)) num_files = sum(1 for _ in warc_directory.iterdir()) - print(f"{num_files} WARC files found", flush=True) - print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True) + logger.info(f"{num_files} WARC files found") + logger.info(f"Calling warc2zim with these args: {warc2zim_args}") return warc2zim(warc2zim_args) -def check_url(url, user_agent, scope=None): - url = urllib.parse.urlparse(url) +def check_url(url: str, user_agent: str, scope: str | None = None): + parsed_url = urllib.parse.urlparse(url) try: with requests.get( - url.geturl(), + parsed_url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 
27), @@ -476,28 +499,28 @@ def check_url(url, user_agent, scope=None): ) as resp: resp.raise_for_status() except requests.exceptions.RequestException as exc: - print(f"failed to connect to {url.geturl()}: {exc}", flush=True) - raise SystemExit(1) + logger.info(f"failed to connect to {parsed_url.geturl()}: {exc}") + raise SystemExit(1) from None actual_url = urllib.parse.urlparse(resp.url) # remove explicit port in URI for default-for-scheme as browsers does it - if actual_url.scheme == "https" and actual_url.port == 443: + if actual_url.scheme == "https" and actual_url.port == 443: # noqa: PLR2004 actual_url = rebuild_uri(actual_url, port="") - if actual_url.scheme == "http" and actual_url.port == 80: + if actual_url.scheme == "http" and actual_url.port == 80: # noqa: PLR2004 actual_url = rebuild_uri(actual_url, port="") - if actual_url.geturl() != url.geturl(): + if actual_url.geturl() != parsed_url.geturl(): if scope in (None, "any"): return actual_url.geturl() - print( - "[WARN] Your URL ({0}) redirects to {1} which {2} on same " - "first-level domain. Depending on your scopeType ({3}), " + logger.info( + "[WARN] Your URL ({}) redirects to {} which {} on same " + "first-level domain. Depending on your scopeType ({}), " "your homepage might be out-of-scope. Please check!".format( - url.geturl(), + parsed_url.geturl(), actual_url.geturl(), "is" - if get_fld(url.geturl()) == get_fld(actual_url.geturl()) + if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) else "is not", scope, ) @@ -505,7 +528,7 @@ def check_url(url, user_agent, scope=None): return actual_url.geturl() - return url.geturl() + return parsed_url.geturl() def get_node_cmd_line(args): @@ -541,7 +564,7 @@ def get_node_cmd_line(args): "config", ]: value = getattr(args, arg) - if value == None or (isinstance(value, bool) and value == False): + if value is None or (isinstance(value, bool) and value is False): continue node_cmd.append("--" + arg) if not isinstance(value, bool): @@ -550,17 +573,22 @@ def get_node_cmd_line(args): return node_cmd -def sigint_handler(*args): - print("") - print("") - print("SIGINT/SIGTERM received, stopping zimit") - print("") - print("", flush=True) +def sigint_handler(*args): # noqa: ARG001 + logger.info("") + logger.info("") + logger.info("SIGINT/SIGTERM received, stopping zimit") + logger.info("") + logger.info("") sys.exit(3) +def zimit(): + run(sys.argv[1:]) + + signal.signal(signal.SIGINT, sigint_handler) signal.signal(signal.SIGTERM, sigint_handler) + if __name__ == "__main__": zimit() diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..90854e8 --- /dev/null +++ b/tasks.py @@ -0,0 +1,109 @@ +# pyright: strict, reportUntypedFunctionDecorator=false +import os + +from invoke.context import Context +from invoke.tasks import task # pyright: ignore [reportUnknownVariableType] + +use_pty = not os.getenv("CI", "") + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test(ctx: Context, args: str = ""): + """run tests (without coverage)""" + ctx.run(f"pytest {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test_cov(ctx: Context, args: str = ""): + """run test vith coverage""" + ctx.run(f"coverage run -m pytest {args}", pty=use_pty) + + +@task(optional=["html"], help={"html": "flag to export html report"}) +def report_cov(ctx: Context, *, html: bool = False): + """report coverage""" + ctx.run("coverage combine", warn=True, pty=use_pty) + ctx.run("coverage report --show-missing", pty=use_pty) + if 
html: + ctx.run("coverage html", pty=use_pty) + + +@task( + optional=["args", "html"], + help={ + "args": "pytest additional arguments", + "html": "flag to export html report", + }, +) +def coverage(ctx: Context, args: str = "", *, html: bool = False): + """run tests and report coverage""" + test_cov(ctx, args=args) + report_cov(ctx, html=html) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def lint_black(ctx: Context, args: str = "."): + args = args or "." # needed for hatch script + ctx.run("black --version", pty=use_pty) + ctx.run(f"black --check --diff {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def lint_ruff(ctx: Context, args: str = "."): + args = args or "." # needed for hatch script + ctx.run("ruff --version", pty=use_pty) + ctx.run(f"ruff check {args}", pty=use_pty) + + +@task( + optional=["args"], + help={ + "args": "linting tools (black, ruff) additional arguments, typically a path", + }, +) +def lintall(ctx: Context, args: str = "."): + """Check linting""" + args = args or "." # needed for hatch script + lint_black(ctx, args) + lint_ruff(ctx, args) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def check_pyright(ctx: Context, args: str = ""): + """check static types with pyright""" + ctx.run("pyright --version") + ctx.run(f"pyright {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def checkall(ctx: Context, args: str = ""): + """check static types""" + check_pyright(ctx, args) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def fix_black(ctx: Context, args: str = "."): + """fix black formatting""" + args = args or "." # needed for hatch script + ctx.run(f"black {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def fix_ruff(ctx: Context, args: str = "."): + """fix all ruff rules""" + args = args or "." # needed for hatch script + ctx.run(f"ruff --fix {args}", pty=use_pty) + + +@task( + optional=["args"], + help={ + "args": "linting tools (black, ruff) additional arguments, typically a path", + }, +) +def fixall(ctx: Context, args: str = "."): + """Fix everything automatically""" + args = args or "." 
# needed for hatch script + fix_black(ctx, args) + fix_ruff(ctx, args) + lintall(ctx, args) diff --git a/tests-integration/README.md b/tests-integration/README.md new file mode 100644 index 0000000..94da094 --- /dev/null +++ b/tests-integration/README.md @@ -0,0 +1 @@ +These are integration tests, meant to be ran inside the CI (because we need to first perform a zimit run on a given website and then check its output) diff --git a/test/integration.py b/tests-integration/integration.py similarity index 90% rename from test/integration.py rename to tests-integration/integration.py index 95f7924..17bfe9f 100644 --- a/test/integration.py +++ b/tests-integration/integration.py @@ -1,6 +1,6 @@ -import os import glob import json +import os import libzim.reader from warcio import ArchiveIterator @@ -26,14 +26,17 @@ def test_zim_main_page(): def test_user_agent(): - """Test that mobile user agent was used in WARC request records with custom Zimit and email suffix""" + """Test that mobile user agent was used + + Check is done in WARC request records with custom Zimit and email suffix + """ found = False for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"): with open(warc, "rb") as fh: for record in ArchiveIterator(fh): if record.rec_type == "request": - print(record.http_headers) + print(record.http_headers) # noqa: T201 ua = record.http_headers.get_header("User-Agent") if ua: assert "Mozilla" in ua diff --git a/tests/test_dummy.py b/tests/test_dummy.py new file mode 100644 index 0000000..dd89067 --- /dev/null +++ b/tests/test_dummy.py @@ -0,0 +1,6 @@ +from zimit.zimit import DEFAULT_USER_AGENT + + +# dummy test, just to have coverage report done +def test_default_user_agent(): + assert DEFAULT_USER_AGENT From c7fdc1d11e39ce7399f4ec0e12521821efdfeb0d Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 22 Jan 2024 10:38:25 +0100 Subject: [PATCH 07/29] Simplify logger name code --- src/zimit/zimit.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index e436feb..c2502a8 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -37,9 +37,7 @@ EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 EXIT_CODE_CRAWLER_LIMIT_HIT = 11 NORMAL_WARC2ZIM_EXIT_CODE = 100 -LOGGER_NAME = Path(__file__).parent.name - -logger = getLogger(name=LOGGER_NAME, level=logging.INFO) +logger = getLogger(name="zimit", level=logging.INFO) class ProgressFileWatcher: From a505df9fe033e207bb11bd06af2ad02e57b1ddca Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 23 Jan 2024 17:28:56 +0100 Subject: [PATCH 08/29] Add support for --logging parameter of browsertrix crawler --- CHANGELOG.md | 1 + src/zimit/zimit.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d52f42..5871a4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - New `--version` flag to display Zimit version +- New `--logging` flag to adjust Browsertrix Crawler logging (#273) ### Changed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index c2502a8..74c19fa 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -353,6 +353,11 @@ def run(raw_args): version=f"Zimit {__version__}", ) + parser.add_argument( + "--logging", + help="Crawler logging configuration", + ) + zimit_args, warc2zim_args = parser.parse_known_args(raw_args) # pass url and output to warc2zim also @@ -560,6 +565,7 @@ def get_node_cmd_line(args): "healthCheckPort", "overwrite", 
"config", + "logging", ]: value = getattr(args, arg) if value is None or (isinstance(value, bool) and value is False): From ef462b50246c6f8ffc9973aadfc543d984335558 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 26 Jan 2024 16:34:26 +0100 Subject: [PATCH 09/29] Empty commit to release warc2zim2 commit ae18aed From 9244f2e69c9adfb81228be1c2185caab8fa40cbd Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 31 Jan 2024 14:56:09 +0100 Subject: [PATCH 10/29] Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata --- CHANGELOG.md | 1 + src/zimit/__about__.py | 2 +- src/zimit/zimit.py | 16 ++++++++++++++++ tests-integration/integration.py | 21 ++++++++++++++------- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5871a4a..d5a432e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `--version` flag to display Zimit version - New `--logging` flag to adjust Browsertrix Crawler logging (#273) +- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275) ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 494af57..11fb1b2 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.0.0-dev0" +__version__ = "2.0.0-dev1" diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 74c19fa..d023c68 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -360,6 +360,22 @@ def run(raw_args): zimit_args, warc2zim_args = parser.parse_known_args(raw_args) + logger.info("Checking browsertrix-crawler version") + crawl_version_cmd = ["crawl", "--version"] + crawl = subprocess.run(crawl_version_cmd, check=False, capture_output=True) + if crawl.returncode: + raise subprocess.CalledProcessError(crawl.returncode, crawl_version_cmd) + else: + crawler_version = crawl.stdout.decode("utf-8").strip() + logger.info(f"Browsertrix crawler: version {crawler_version}") + + # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler + # versions are associated with the ZIM + warc2zim_args.append("--scraper-suffix") + warc2zim_args.append( + f" + zimit {__version__} + Browsertrix crawler {crawler_version}" + ) + # pass url and output to warc2zim also if zimit_args.output: warc2zim_args.append("--output") diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 17bfe9f..0463fad 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -2,13 +2,9 @@ import glob import json import os -import libzim.reader +from libzim.reader import Archive as LibzimArchive from warcio import ArchiveIterator - - -def get_zim_main_entry(zimfile): - zim_fh = libzim.reader.Archive(zimfile) - return zim_fh.main_entry +from zimscraperlib.zim import Archive as ScraperLibArchive def test_is_file(): @@ -20,11 +16,22 @@ def test_zim_main_page(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - main_entry = get_zim_main_entry("/output/isago.zim") + main_entry = LibzimArchive("/output/isago.zim").main_entry assert main_entry.is_redirect assert main_entry.get_redirect_entry().path == "isago.rskg.org/" +def test_zim_scraper(): + """Main page specified, http://isago.rskg.org/, was a redirect to https + Ensure main page is the redirected page""" + + zim_fh = ScraperLibArchive("/output/isago.zim") + scraper = zim_fh.get_text_metadata("Scraper") + assert "zimit " in 
scraper + assert "warc2zim " in scraper + assert "Browsertrix crawler " in scraper + + def test_user_agent(): """Test that mobile user agent was used From 49da57c5b686f272db7e286909734963daabd89a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 5 Feb 2024 14:33:38 +0100 Subject: [PATCH 11/29] fixup! Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata --- src/zimit/zimit.py | 23 ++++++++++++++--------- tests-integration/integration.py | 7 +++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index d023c68..5d91607 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -362,12 +362,15 @@ def run(raw_args): logger.info("Checking browsertrix-crawler version") crawl_version_cmd = ["crawl", "--version"] - crawl = subprocess.run(crawl_version_cmd, check=False, capture_output=True) - if crawl.returncode: - raise subprocess.CalledProcessError(crawl.returncode, crawl_version_cmd) - else: - crawler_version = crawl.stdout.decode("utf-8").strip() - logger.info(f"Browsertrix crawler: version {crawler_version}") + try: + crawl = subprocess.run( + crawl_version_cmd, check=True, capture_output=True, text=True + ) + except Exception: + logger.error("Failed to get Browsertrix crawler version") + raise + crawler_version = crawl.stdout + logger.info(f"Browsertrix crawler: version {crawler_version}") # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler # versions are associated with the ZIM @@ -538,9 +541,11 @@ def check_url(url: str, user_agent: str, scope: str | None = None): "your homepage might be out-of-scope. Please check!".format( parsed_url.geturl(), actual_url.geturl(), - "is" - if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) - else "is not", + ( + "is" + if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl()) + else "is not" + ), scope, ) ) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 0463fad..14c7ad8 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -2,9 +2,8 @@ import glob import json import os -from libzim.reader import Archive as LibzimArchive from warcio import ArchiveIterator -from zimscraperlib.zim import Archive as ScraperLibArchive +from zimscraperlib.zim import Archive def test_is_file(): @@ -16,7 +15,7 @@ def test_zim_main_page(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - main_entry = LibzimArchive("/output/isago.zim").main_entry + main_entry = Archive("/output/isago.zim").main_entry assert main_entry.is_redirect assert main_entry.get_redirect_entry().path == "isago.rskg.org/" @@ -25,7 +24,7 @@ def test_zim_scraper(): """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - zim_fh = ScraperLibArchive("/output/isago.zim") + zim_fh = Archive("/output/isago.zim") scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper From 01c5833c290d1aa57ffa14a4858f0c7256238380 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 9 Feb 2024 11:10:57 +0100 Subject: [PATCH 12/29] Empty commit to release warc2zim2 commit f837179 From 6ca9be48c71cb575623293d728feb9f9b1e3862b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 16 Feb 2024 10:03:04 +0100 Subject: [PATCH 13/29] Empty commit to release warc2zim2 commit 3c00da0 From a44c1a7c7ff2f520370086a111bdeb0ce2689465 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 1 Mar 
2024 13:46:05 +0000 Subject: [PATCH 14/29] Upgrade dependencies --- .pre-commit-config.yaml | 6 +++--- pyproject.toml | 24 +++++++++++++----------- tasks.py | 2 +- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42d7ca3..b93b4a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,15 +7,15 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: "23.12.1" + rev: "24.2.0" hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.3 + rev: v0.3.0 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.347 + rev: v1.1.352 hooks: - id: pyright name: pyright (system) diff --git a/pyproject.toml b/pyproject.toml index ad7db1f..177acf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,19 +33,19 @@ scripts = [ "invoke==2.2.0", ] lint = [ - "black==23.12.1", - "ruff==0.1.3", + "black==24.2.0", + "ruff==0.3.0", ] check = [ - "pyright==1.1.347", + "pyright==1.1.352", ] test = [ - "pytest==7.4.4", - "coverage==7.4.0", + "pytest==8.0.2", + "coverage==7.4.3", ] dev = [ - "pre-commit==3.6.0", - "debugpy==1.8.0", + "pre-commit==3.6.2", + "debugpy==1.8.1", "zimit[scripts]", "zimit[lint]", "zimit[test]", @@ -111,6 +111,8 @@ target-version = ['py311'] target-version = "py311" line-length = 88 src = ["src"] + +[tool.ruff.lint] select = [ "A", # flake8-builtins # "ANN", # flake8-annotations @@ -187,17 +189,17 @@ unfixable = [ "F401", ] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["zimit"] -[tool.ruff.flake8-bugbear] +[tool.ruff.lint.flake8-bugbear] # add exceptions to B008 for fastapi. extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"] -[tool.ruff.flake8-tidy-imports] +[tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests**/**/*" = ["PLR2004", "S101", "TID252"] diff --git a/tasks.py b/tasks.py index 90854e8..a95c71a 100644 --- a/tasks.py +++ b/tasks.py @@ -92,7 +92,7 @@ def fix_black(ctx: Context, args: str = "."): def fix_ruff(ctx: Context, args: str = "."): """fix all ruff rules""" args = args or "." 
# needed for hatch script - ctx.run(f"ruff --fix {args}", pty=use_pty) + ctx.run(f"ruff check --fix {args}", pty=use_pty) @task( From 89aea6b41ed95cbfd9a6c301073027ea88d5380c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 1 Mar 2024 13:47:13 +0000 Subject: [PATCH 15/29] Adopt hatch-openzim plugin --- pyproject.toml | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 177acf1..ac7113f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,29 +1,22 @@ [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-openzim==0.2.0"] build-backend = "hatchling.build" [project] name = "zimit" -authors = [ - { name = "Kiwix", email = "dev@kiwix.org" }, -] -keywords = ["some"] requires-python = ">=3.11,<3.12" description = "Make ZIM file from any website through crawling" readme = "README.md" -license = {text = "GPL-3.0-or-later"} -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.11", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", -] dependencies = [ "requests==2.31.0", "inotify==0.2.10", "tld==0.13", "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2", ] -dynamic = ["version"] +dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] + +[tool.hatch.metadata.hooks.openzim-metadata] +kind = "scraper" [tool.hatch.metadata] allow-direct-references = true # to be removed once we use a released warc2zim version @@ -52,10 +45,6 @@ dev = [ "zimit[check]", ] -[project.urls] -Homepage = "https://github.com/openzim/zimit" -Donate = "https://www.kiwix.org/en/support-us/" - [project.scripts] zimit = "zimit:zimit.zimit" From 857ae5674de6ce80a18a86332927be10cdd2804d Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 1 Mar 2024 13:47:27 +0000 Subject: [PATCH 16/29] Upgrade to Python 3.12 --- Dockerfile | 7 +++++-- pyproject.toml | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3435fbf..7126f44 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,16 @@ FROM webrecorder/browsertrix-crawler:0.12.4 LABEL org.opencontainers.image.source https://github.com/openzim/zimit +# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy +RUN add-apt-repository ppa:deadsnakes/ppa -y + RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ libmagic1 \ - python3.11-venv \ + python3.12-venv \ && rm -rf /var/lib/apt/lists/* \ # python setup (in venv not to conflict with browsertrix) - && python3.11 -m venv /app/zimit \ + && python3.12 -m venv /app/zimit \ # placeholder (default output location) && mkdir -p /output \ # disable chrome upgrade diff --git a/pyproject.toml b/pyproject.toml index ac7113f..4648ac4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "zimit" -requires-python = ">=3.11,<3.12" +requires-python = ">=3.12,<3.13" description = "Make ZIM file from any website through crawling" readme = "README.md" dependencies = [ @@ -94,10 +94,10 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 -target-version = ['py311'] +target-version = ['py312'] [tool.ruff] -target-version = "py311" +target-version = "py312" line-length = 88 src = ["src"] @@ -220,5 +220,5 @@ exclude_lines = [ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.11" +pythonVersion = "3.12" typeCheckingMode="basic" From 
b69f3d610f4b7609f5a1bfcc6726dd79061c4e80 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 29 Feb 2024 17:02:34 +0100 Subject: [PATCH 17/29] Upgrade to crawler 1.0.0-beta5 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7126f44..a34b671 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:0.12.4 +FROM webrecorder/browsertrix-crawler:1.0.0-beta.5 LABEL org.opencontainers.image.source https://github.com/openzim/zimit # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy From 4d31f8eabb2b1a2d67afccf8c084572f484d7886 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 29 Feb 2024 17:04:09 +0100 Subject: [PATCH 18/29] Remove handling of redirects which are now done by browsertrix crawler --- src/zimit/zimit.py | 48 ++++++---------------------------------------- 1 file changed, 6 insertions(+), 42 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 5d91607..178d449 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -20,8 +20,6 @@ from pathlib import Path import inotify import inotify.adapters -import requests -from tld import get_fld from warc2zim.main import main as warc2zim from zimscraperlib.logging import getLogger from zimscraperlib.uri import rebuild_uri @@ -393,7 +391,7 @@ def run(raw_args): user_agent += f" {zimit_args.adminEmail}" if url: - url = check_url(url, user_agent, zimit_args.scopeType) + url = clean_url(url) warc2zim_args.append("--url") warc2zim_args.append(url) @@ -509,48 +507,14 @@ def run(raw_args): return warc2zim(warc2zim_args) -def check_url(url: str, user_agent: str, scope: str | None = None): +def clean_url(url: str): parsed_url = urllib.parse.urlparse(url) - try: - with requests.get( - parsed_url.geturl(), - stream=True, - allow_redirects=True, - timeout=(12.2, 27), - headers={"User-Agent": user_agent}, - ) as resp: - resp.raise_for_status() - except requests.exceptions.RequestException as exc: - logger.info(f"failed to connect to {parsed_url.geturl()}: {exc}") - raise SystemExit(1) from None - actual_url = urllib.parse.urlparse(resp.url) # remove explicit port in URI for default-for-scheme as browsers does it - if actual_url.scheme == "https" and actual_url.port == 443: # noqa: PLR2004 - actual_url = rebuild_uri(actual_url, port="") - if actual_url.scheme == "http" and actual_url.port == 80: # noqa: PLR2004 - actual_url = rebuild_uri(actual_url, port="") - - if actual_url.geturl() != parsed_url.geturl(): - if scope in (None, "any"): - return actual_url.geturl() - - logger.info( - "[WARN] Your URL ({}) redirects to {} which {} on same " - "first-level domain. Depending on your scopeType ({}), " - "your homepage might be out-of-scope. 
-            "your homepage might be out-of-scope. Please check!".format(
-                parsed_url.geturl(),
-                actual_url.geturl(),
-                (
-                    "is"
-                    if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl())
-                    else "is not"
-                ),
-                scope,
-            )
-        )
-
-    return actual_url.geturl()
+    if parsed_url.scheme == "https" and parsed_url.port == 443:  # noqa: PLR2004
+        parsed_url = rebuild_uri(parsed_url, port="")
+    if parsed_url.scheme == "http" and parsed_url.port == 80:  # noqa: PLR2004
+        parsed_url = rebuild_uri(parsed_url, port="")
 
     return parsed_url.geturl()
 

From a4cb27a79347e690af3b07c5137cb1256fdc2573 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Mon, 4 Mar 2024 10:02:50 +0000
Subject: [PATCH 19/29] Fix clean_url method name

---
 src/zimit/zimit.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 178d449..dbf3965 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -391,7 +391,7 @@ def run(raw_args):
         user_agent += f" {zimit_args.adminEmail}"
 
     if url:
-        url = clean_url(url)
+        url = get_cleaned_url(url)
         warc2zim_args.append("--url")
         warc2zim_args.append(url)
 
@@ -507,7 +507,7 @@ def run(raw_args):
     return warc2zim(warc2zim_args)
 
 
-def clean_url(url: str):
+def get_cleaned_url(url: str):
     parsed_url = urllib.parse.urlparse(url)
 
     # remove explicit port in URI for default-for-scheme as browsers does it

From a9769b28717e0be511539593ff314c3a144e0ca2 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 7 Mar 2024 08:00:31 +0000
Subject: [PATCH 20/29] Upgrade to crawler 1.0.0-beta6

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index a34b671..53306f3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.0.0-beta.5
+FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy

From 456219deb3a4deba3fd867d749bbe4e78c0f8ad0 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 7 Mar 2024 08:40:32 +0000
Subject: [PATCH 21/29] Fix tests, there are in fact only 7 items to be pushed to the ZIM

7 entries are expected:
https://isago.rskg.org/
https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css
https://isago.rskg.org/static/favicon256.png
https://isago.rskg.org/conseils
https://isago.rskg.org/faq
https://isago.rskg.org/a-propos
https://isago.rskg.org/static/tarifs-isago.pdf

1 unexpected entry is not produced anymore by Browsertrix crawler:
https://dict.brave.com/edgedl/chrome/dict/en-us-10-1.bdic

This was a technical artifact
---
 tests-integration/integration.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests-integration/integration.py b/tests-integration/integration.py
index 14c7ad8..9550386 100644
--- a/tests-integration/integration.py
+++ b/tests-integration/integration.py
@@ -65,12 +65,12 @@ def test_stats_output():
         }
     with open("/output/warc2zim.json") as fh:
         assert json.loads(fh.read()) == {
-            "written": 8,
-            "total": 8,
+            "written": 7,
+            "total": 7,
         }
     with open("/output/stats.json") as fh:
         assert json.loads(fh.read()) == {
-            "done": 8,
-            "total": 8,
+            "done": 7,
+            "total": 7,
             "limit": {"max": 0, "hit": False},
         }

From 5c716747b476a434342ddf33abfd3c87cecc7387 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 7 Mar 2024 08:40:55 +0000
Subject: [PATCH 22/29] Add CHANGELOG

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5a432e..ac640f3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Using `warc2zim2` warc2zim ⚠️ change before releasing!
 - Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
 - Adopt Python bootstrap conventions
+- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
+- Upgrade to Python 3.12 + upgrade dependencies
 
 ## [1.6.3] - 2024-01-18
 

From 54732692ac5935df89bfe455e9fecb1229fb9e2a Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 7 Mar 2024 12:47:38 +0000
Subject: [PATCH 23/29] Bump dev version

---
 src/zimit/__about__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 11fb1b2..c8587ef 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "2.0.0-dev1"
+__version__ = "2.0.0-dev2"

From 3070fe9724c48967438c060c0253b449d9f8776b Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 27 Mar 2024 13:16:00 +0000
Subject: [PATCH 24/29] Rollback previous changes around the presence of a default user-agent

- Remove default userAgent value
- Set a default mobileDevice
- Add back comments explaining that userAgent overrides other settings
- Add back logic around the computation of the userAgentSuffix instead of the userAgent
- Add new noMobileDevice argument to not set the default mobileDevice
---
 src/zimit/zimit.py  | 35 ++++++++++++++++++++---------------
 tests/test_dummy.py |  6 +++---
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index dbf3965..a1d45d8 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -26,11 +26,6 @@ from zimscraperlib.uri import rebuild_uri
 
 from zimit.__about__ import __version__
 
-DEFAULT_USER_AGENT = (
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
-    "(KHTML, like Gecko) Version/17.0 Safari/605.1.15"
-)
-
 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
 EXIT_CODE_CRAWLER_LIMIT_HIT = 11
 NORMAL_WARC2ZIM_EXIT_CODE = 100
@@ -230,13 +225,21 @@ def run(raw_args):
         help="Emulate mobile device by name from "
         "https://github.com/puppeteer/puppeteer/blob/"
         "main/packages/puppeteer-core/src/common/Device.ts",
+        default="Pixel 2",
+    )
+
+    parser.add_argument(
+        "--noMobileDevice",
+        help="Do not emulate a mobile device (use at your own risk, behavior is "
+        "uncertain)",
+        action="store_true",
+        default=False,
     )
 
     parser.add_argument(
         "--userAgent",
-        help="Override default user-agent with specified value ; --userAgentSuffix is "
-        "still applied",
-        default=DEFAULT_USER_AGENT,
+        help="Override default user-agent with specified value ; --userAgentSuffix and "
+        "--adminEmail have no effect when this is set",
     )
 
     parser.add_argument(
@@ -384,11 +387,9 @@ def run(raw_args):
     url = zimit_args.url
 
-    user_agent = zimit_args.userAgent
-    if zimit_args.userAgentSuffix:
-        user_agent += f" {zimit_args.userAgentSuffix}"
+    user_agent_suffix = zimit_args.userAgentSuffix
     if zimit_args.adminEmail:
-        user_agent += f" {zimit_args.adminEmail}"
+        user_agent_suffix += f" {zimit_args.adminEmail}"
 
     if url:
         url = get_cleaned_url(url)
@@ -443,8 +444,12 @@ def run(raw_args):
     cmd_args.append("--url")
     cmd_args.append(url)
 
-    cmd_args.append("--userAgent")
-    cmd_args.append(user_agent)
+    cmd_args.append("--userAgentSuffix")
+    cmd_args.append(user_agent_suffix)
+
+    if not zimit_args.noMobileDevice:
+        cmd_args.append("--mobileDevice")
+        cmd_args.append(zimit_args.mobileDevice)
 
     cmd_args.append("--cwd")
     cmd_args.append(str(temp_root_dir))
@@ -538,7 +543,7 @@ def get_node_cmd_line(args):
         "collection",
         "allowHashUrls",
         "lang",
-        "mobileDevice",
+        "userAgent",
         "useSitemap",
         "behaviors",
         "behaviorTimeout",
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index dd89067..54af094 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -1,6 +1,6 @@
-from zimit.zimit import DEFAULT_USER_AGENT
+from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
 
 
 # dummy test, just to have coverage report done
-def test_default_user_agent():
-    assert DEFAULT_USER_AGENT
+def test_something_exists():
+    assert NORMAL_WARC2ZIM_EXIT_CODE

From e24479945f79113694cb76e14a62e4ddf2c2020e Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 27 Mar 2024 13:18:04 +0000
Subject: [PATCH 25/29] Remove trailing characters when retrieving Browsertrix Crawler version

---
 CHANGELOG.md       | 7 +++++++
 src/zimit/zimit.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac640f3..a4d077b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - New `--version` flag to display Zimit version
 - New `--logging` flag to adjust Browsertrix Crawler logging (#273)
 - Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
+- New `--noMobileDevice` CLI argument
 
 ### Changed
 
@@ -21,6 +22,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Adopt Python bootstrap conventions
 - Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
 - Upgrade to Python 3.12 + upgrade dependencies
+- `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
+- `--userAgent` CLI argument is not mandatory anymore
+
+### Fixed
+
+- Fix support for Youtube videos (#291)
 
 ## [1.6.3] - 2024-01-18
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index a1d45d8..7c2764a 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -370,7 +370,7 @@ def run(raw_args):
     except Exception:
         logger.error("Failed to get Browsertrix crawler version")
         raise
-    crawler_version = crawl.stdout
+    crawler_version = crawl.stdout.strip()
     logger.info(f"Browsertrix crawler: version {crawler_version}")
 
     # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler

From 728784d6bf143735cf03d05ee2fa52f838404cf3 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 27 Mar 2024 13:22:59 +0000
Subject: [PATCH 26/29] Upgrade Browsertrix Crawler to 1.0.3

---
 CHANGELOG.md | 1 +
 Dockerfile   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a4d077b..81dc91d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Upgrade to Python 3.12 + upgrade dependencies
 - `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
 - `--userAgent` CLI argument is not mandatory anymore
+- Upgraded Browsertrix Crawler to 1.0.3
 
 ### Fixed
 
 - Fix support for Youtube videos (#291)
 
 ## [1.6.3] - 2024-01-18
diff --git a/Dockerfile b/Dockerfile
index 53306f3..c1731d9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
+FROM webrecorder/browsertrix-crawler:1.0.3
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy

From d54aa22bb2f6824ea13b7fe45116f506a8f0ffd3 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 19 Apr 2024 12:30:53 +0000
Subject: [PATCH 27/29] Upgrade to Browsertrix Crawler 1.1.0

---
 Dockerfile             | 2 +-
 src/zimit/__about__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c1731d9..13b96bf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.0.3
+FROM webrecorder/browsertrix-crawler:1.1.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index c8587ef..aacbf87 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "2.0.0-dev2"
+__version__ = "2.0.0-dev3"

From 9a7415a4029985ed83dafd3b656bfcbfdc95a165 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Sat, 4 May 2024 19:00:08 +0000
Subject: [PATCH 28/29] Upgrade to Browsertrix Crawler 1.1.1

Continue to use warc2zim 2.0.0-dev5 for now, Docker build issue with new
stuff in warc2zim 2.0.0-dev6, will be fixed later on
---
 Dockerfile             | 2 +-
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 13b96bf..a7162ba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.1.0
+FROM webrecorder/browsertrix-crawler:1.1.1
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
 # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
diff --git a/pyproject.toml b/pyproject.toml
index 4648ac4..a2251a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "requests==2.31.0",
     "inotify==0.2.10",
     "tld==0.13",
-    "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
+    "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2-dev5",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 
diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index aacbf87..9eca36d 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "2.0.0-dev3"
+__version__ = "2.0.0-dev4"

From 1d54b208738e9deac41d5708c9930b4387aba92f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Mon, 6 May 2024 09:55:38 +0000
Subject: [PATCH 29/29] Upgrade to warc2zim 2.0.0-dev6

---
 Dockerfile     | 4 ++--
 pyproject.toml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index a7162ba..5aacae8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,14 +32,14 @@ COPY pyproject.toml README.md /src/
 COPY src/zimit/__about__.py /src/src/zimit/__about__.py
 
 # Install Python dependencies
-RUN /app/zimit/bin/python -m pip install --no-cache-dir /src
+RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
 
 # Copy code + associated artifacts
 COPY src /src/src
 COPY *.md /src/
 
 # Install + cleanup
-RUN /app/zimit/bin/python -m pip install --no-cache-dir /src \
+RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
     && ln -s /app/zimit/bin/zimit /usr/bin/zimit \
     && chmod +x /usr/bin/zimit \
     && rm -rf /src
diff --git a/pyproject.toml b/pyproject.toml
index a2251a7..4648ac4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
     "requests==2.31.0",
     "inotify==0.2.10",
     "tld==0.13",
-    "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2-dev5",
+    "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 
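
Note on the URL cleaning introduced in patches 18/29 and 19/29: `get_cleaned_url()` no longer probes the target URL with `requests` nor follows redirects (Browsertrix Crawler now takes care of redirects); it only drops an explicit default-for-scheme port, as browsers do. Below is a minimal, standalone sketch of that behaviour using only the Python standard library; it approximates `rebuild_uri()` from zimscraperlib, and the helper name `cleaned_url` as well as the example URLs are purely illustrative.

import urllib.parse


def cleaned_url(url: str) -> str:
    # drop an explicit default port (http:80 / https:443), as browsers do
    parsed = urllib.parse.urlparse(url)
    default_port = {"http": 80, "https": 443}.get(parsed.scheme)
    if parsed.port is not None and parsed.port == default_port:
        # when urlparse reports a port, netloc ends with ":<port>"
        parsed = parsed._replace(netloc=parsed.netloc.rsplit(":", 1)[0])
    return parsed.geturl()


assert cleaned_url("https://isago.rskg.org:443/") == "https://isago.rskg.org/"
assert cleaned_url("http://example.com:8080/page") == "http://example.com:8080/page"

Non-default ports are kept as-is, which matches the `rebuild_uri`-based implementation in `src/zimit/zimit.py`.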