Merge pull request #60 from openzim/stats

stats: add support for stats output after every page crawled, fixes #39
rgaudin authored on 2020-12-04 11:21:44 +00:00 (committed by GitHub)
commit b9ed1d00a2
4 changed files with 18 additions and 2 deletions
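
The new --statsFilename option tells the crawler to rewrite a small JSON progress file after every page it crawls. As a minimal sketch of how a caller might consume that file: the /output/stats.json path is the one used in the CI workflow below, the three keys are those asserted by the integration test in this PR, and treating "workersRunning == 0 with numCrawled == total" as completion is an assumption.

import json
import time

def wait_for_crawl(stats_path="/output/stats.json", poll_seconds=2):
    # Poll the stats file that the crawler rewrites after every page crawled.
    while True:
        try:
            with open(stats_path) as fh:
                stats = json.load(fh)
        except (FileNotFoundError, json.JSONDecodeError):
            # Not written yet, or caught mid-write: retry on the next tick.
            time.sleep(poll_seconds)
            continue
        print(f"crawled {stats['numCrawled']}/{stats['total']} pages")
        if stats["workersRunning"] == 0 and stats["numCrawled"] >= stats["total"]:
            return stats
        time.sleep(poll_seconds)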

@@ -13,7 +13,7 @@ jobs:
         run: docker build -t openzim/zimit:dev .
       - name: run crawl
-        run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --keep
+        run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep
       - name: run integration test suite
         run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py"

@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:0.1.2
+FROM webrecorder/browsertrix-crawler:0.1.3
 RUN mkdir -p /output

@@ -1,5 +1,6 @@
 import os
 import glob
+import json

 import libzim.reader
 from warcio import ArchiveIterator
@@ -41,3 +42,12 @@ def test_user_agent():

     # should find at least one
     assert found
+
+
+def test_stats_output():
+    with open("/output/stats.json") as fh:
+        assert json.loads(fh.read()) == {
+            "numCrawled": 5,
+            "workersRunning": 0,
+            "total": 5,
+        }
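
The exact numbers are deterministic here: the CI workflow above crawls the small isago.ml test site, which comes out to five pages, and workersRunning is back to 0 once the crawl has finished.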

@@ -94,6 +94,11 @@ def zimit(args=None):
         help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
     )

+    parser.add_argument(
+        "--statsFilename",
+        help="If set, output stats as JSON to this file",
+    )
+
     zimit_args, warc2zim_args = parser.parse_known_args(args)

     # pass url and output to warc2zim also
@@ -199,6 +204,7 @@ def get_node_cmd_line(args):
         "scroll",
         "mobileDevice",
         "useSitemap",
+        "statsFilename",
     ]:
         value = getattr(args, arg)
         if value:
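
For context, a hypothetical reconstruction of how this loop plausibly completes: only the option list and the getattr/if-value lines are visible in the hunk above, so the command name and flag formatting below are assumptions for illustration, not the actual implementation.

def get_node_cmd_line_sketch(args):
    # Hypothetical sketch: turn the parsed options back into CLI flags
    # for the browsertrix-crawler invocation.
    node_cmd = ["crawl"]  # assumed entry point
    for arg in [
        # ...earlier crawler options elided...
        "scroll",
        "mobileDevice",
        "useSitemap",
        "statsFilename",
    ]:
        value = getattr(args, arg)
        if value:
            node_cmd.append(f"--{arg}")
            if not isinstance(value, bool):
                # Booleans become bare flags; other values are appended.
                node_cmd.append(str(value))
    return node_cmd

With --statsFilename /output/stats.json this would yield [..., "--statsFilename", "/output/stats.json"], which is presumably why the Dockerfile change above bumps browsertrix-crawler to 0.1.3, a release that understands the option.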