From 5084c54af6bc9b40f157d0a7f19254d5335dccbc Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 2 Dec 2020 16:28:25 +0000
Subject: [PATCH] stats: add support for stats output after every page crawled, fixes #39

tests: integration test checks for stats.json
---
 .github/workflows/ci.yaml |  2 +-
 Dockerfile                |  2 +-
 test/integration.py       | 10 ++++++++++
 zimit.py                  |  6 ++++++
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4fcf517..c9d77b7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,7 +13,7 @@ jobs:
         run: docker build -t openzim/zimit:dev .
 
       - name: run crawl
-        run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --keep
+        run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep
 
       - name: run integration test suite
         run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py"
diff --git a/Dockerfile b/Dockerfile
index 5d7c8b3..32b910b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:0.1.2
+FROM webrecorder/browsertrix-crawler:0.1.3
 
 RUN mkdir -p /output
 
diff --git a/test/integration.py b/test/integration.py
index cbc9ecc..b823367 100644
--- a/test/integration.py
+++ b/test/integration.py
@@ -1,5 +1,6 @@
 import os
 import glob
+import json
 
 import libzim.reader
 from warcio import ArchiveIterator
@@ -41,3 +42,12 @@ def test_user_agent():
 
     # should find at least one
     assert found
+
+
+def test_stats_output():
+    with open("/output/stats.json") as fh:
+        assert json.loads(fh.read()) == {
+            "numCrawled": 5,
+            "workersRunning": 0,
+            "total": 5,
+        }
diff --git a/zimit.py b/zimit.py
index fe06355..3436b9b 100755
--- a/zimit.py
+++ b/zimit.py
@@ -94,6 +94,11 @@ def zimit(args=None):
         help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
     )
 
+    parser.add_argument(
+        "--statsFilename",
+        help="If set, output stats as JSON to this file",
+    )
+
     zimit_args, warc2zim_args = parser.parse_known_args(args)
 
     # pass url and output to warc2zim also
@@ -199,6 +204,7 @@ def get_node_cmd_line(args):
         "scroll",
         "mobileDevice",
         "useSitemap",
+        "statsFilename",
     ]:
         value = getattr(args, arg)
         if value: