From 3ffa34d46e3b904635345a57aea9254f7949e4bb Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Thu, 10 Dec 2020 10:04:17 +0000 Subject: [PATCH 1/2] Enhanced --statsFilename support - `--statsFilename` to now represent overall zimit progress and not just crawling - Exposing a simpler (`done`, `total`) json format for progress - Live converting individual step's progress into this file - using warc2zim 1.3.3 for its `--progress-file` support - Currently arbitrarily assigning 90% to crawl and 10% to warc2zim --- Dockerfile | 2 +- zimit.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 32b910b..913a69a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ RUN mkdir -p /output WORKDIR /app -RUN pip install 'warc2zim>=1.3.2' 'requests>=2.24.0' +RUN pip install 'warc2zim>=1.3.3' 'requests>=2.24.0' 'inotify==0.2.10' ADD zimit.py /app/ diff --git a/zimit.py b/zimit.py index 3436b9b..86981a9 100755 --- a/zimit.py +++ b/zimit.py @@ -15,11 +15,81 @@ import atexit import shutil import signal import sys +import json from pathlib import Path from urllib.parse import urlsplit +from multiprocessing import Process from warc2zim.main import warc2zim import requests +import inotify +import inotify.adapters + + +class ProgressFileWatcher: + def __init__(self, output_dir, stats_path): + self.crawl_path = output_dir / "crawl.json" + self.warc2zim_path = output_dir / "warc2zim.json" + self.stats_path = Path(stats_path) + + if not self.stats_path.is_absolute(): + self.stats_path = output_dir / self.stats_path + + # touch them all so inotify is not unhappy on add_watch + self.crawl_path.touch() + self.warc2zim_path.touch() + self.stats_path.touch() + self.process = None + + def stop(self): + self.process.join(0.1) + self.process.terminate() + + def watch(self): + self.process = Process( + target=self.inotify_watcher, + args=(str(self.crawl_path), str(self.warc2zim_path), 
str(self.stats_path)), + ) + self.process.daemon = True + self.process.start() + + @staticmethod + def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath): + ino = inotify.adapters.Inotify() + ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) + ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) + + def crawl_conv(data): + # we consider crawl to be 90% of the workload so total = crawl_total / 90% + return {"done": data["numCrawled"], "total": int(data["total"] / 0.9)} + + def warc2zim_conv(data): + # we consider warc2zim to be 10% of the workload so + # warc2zim_total = 10% and done = 90% + warc2zim_progress * 10% + return { + "done": int( + data["total"] + * (0.9 + (float(data["written"]) / data["total"]) / 10) + ), + "total": data["total"], + } + + for _, _, fpath, _ in ino.event_gen(yield_nones=False): + func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath) + if not func: + continue + # open input and output separately as to not clear output on error + with open(fpath, "r") as ifh: + try: + out = func(json.load(ifh)) + except Exception: # nosec + # simply ignore progress update should an error arise + # might be malformed input for instance + continue + if not out: + continue + with open(output_fpath, "w") as ofh: + json.dump(out, ofh) def zimit(args=None): @@ -149,6 +219,21 @@ def zimit(args=None): cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) + # setup inotify crawler progress watcher + if zimit_args.statsFilename: + watcher = ProgressFileWatcher( + Path(zimit_args.output), Path(zimit_args.statsFilename) + ) + print(f"Writing progress to {watcher.stats_path}") + # update crawler command + cmd_args.append("--statsFilename") + cmd_args.append(str(watcher.crawl_path)) + # update warc2zim command + warc2zim_args.append("-v") + warc2zim_args.append("--progress-file") + warc2zim_args.append(str(watcher.warc2zim_path)) + watcher.watch() + cmd_line = " ".join(cmd_args) print("") @@ -204,7 +289,6 @@ def 
get_node_cmd_line(args): "scroll", "mobileDevice", "useSitemap", - "statsFilename", ]: value = getattr(args, arg) if value: From 85fad62b614eef0272a039d5860d4d2941b3cd27 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Thu, 10 Dec 2020 10:25:30 +0000 Subject: [PATCH 2/2] Updated test to new stats files - verify output of crawl, warc2zim and zimit file - using a simpler tag for CI test image as to not confuse it with public image --- .github/workflows/ci.yaml | 6 +++--- test/integration.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c9d77b7..0b558e6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,10 +10,10 @@ jobs: uses: actions/checkout@v2 - name: build image - run: docker build -t openzim/zimit:dev . + run: docker build -t zimit . - name: run crawl - run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output zimit zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --statsFilename /output/stats.json --keep - name: run integration test suite - run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py" + run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py" diff --git a/test/integration.py b/test/integration.py index b823367..dbdb05c 100644 --- a/test/integration.py +++ b/test/integration.py @@ -45,9 +45,19 @@ def test_user_agent(): def test_stats_output(): - with open("/output/stats.json") as fh: + with open("/output/crawl.json") as fh: assert json.loads(fh.read()) == { "numCrawled": 5, 
"workersRunning": 0, "total": 5, } + with open("/output/warc2zim.json") as fh: + assert json.loads(fh.read()) == { + "written": 7, + "total": 7, + } + with open("/output/stats.json") as fh: + assert json.loads(fh.read()) == { + "done": 7, + "total": 7, + }