mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 19:38:32 -04:00
Merge pull request #60 from openzim/stats
stats: add support for stats output after every page crawled, fixes #39
This commit is contained in:
commit
b9ed1d00a2
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
@@ -13,7 +13,7 @@ jobs:
 run: docker build -t openzim/zimit:dev .

 - name: run crawl
-run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --keep
+run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep

 - name: run integration test suite
 run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py"
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:0.1.2
+FROM webrecorder/browsertrix-crawler:0.1.3

 RUN mkdir -p /output

@@ -1,5 +1,6 @@
 import os
 import glob
+import json

 import libzim.reader
 from warcio import ArchiveIterator
@@ -41,3 +42,12 @@ def test_user_agent():

 # should find at least one
 assert found
+
+
+def test_stats_output():
+with open("/output/stats.json") as fh:
+assert json.loads(fh.read()) == {
+"numCrawled": 5,
+"workersRunning": 0,
+"total": 5,
+}
6
zimit.py
6
zimit.py
@@ -94,6 +94,11 @@ def zimit(args=None):
 help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
 )

+parser.add_argument(
+"--statsFilename",
+help="If set, output stats as JSON to this file",
+)
+
 zimit_args, warc2zim_args = parser.parse_known_args(args)

 # pass url and output to warc2zim also
@@ -199,6 +204,7 @@ def get_node_cmd_line(args):
 "scroll",
 "mobileDevice",
 "useSitemap",
+"statsFilename",
 ]:
 value = getattr(args, arg)
 if value:
Loading…
x
Reference in New Issue
Block a user