capture and incorporate limit info from crawl

This commit is contained in:
renaud gaudin 2021-01-29 14:29:37 +00:00
parent 964746481f
commit cfa4b0e7f8
2 changed files with 26 additions and 5 deletions

View File

@ -50,6 +50,7 @@ def test_stats_output():
"numCrawled": 5, "numCrawled": 5,
"workersRunning": 0, "workersRunning": 0,
"total": 5, "total": 5,
"limit": {"max": 0, "hit": False}
} }
with open("/output/warc2zim.json") as fh: with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == { assert json.loads(fh.read()) == {
@ -60,4 +61,5 @@ def test_stats_output():
assert json.loads(fh.read()) == { assert json.loads(fh.read()) == {
"done": 7, "done": 7,
"total": 7, "total": 7,
"limit": {"max": 0, "hit": False}
} }

View File

@ -60,11 +60,29 @@ class ProgressFileWatcher:
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
def crawl_conv(data): class Limit:
# we consider crawl to be 90% of the workload so total = craw_total * 90% def __init__(self):
return {"done": data["numCrawled"], "total": int(data["total"] / 0.9)} self.max = self.hit = None
def warc2zim_conv(data): @property
def as_dict(self):
return {"max": self.max, "hit": self.hit}
# limit is only reported by crawl but needs to be reported up
limit = Limit()
def crawl_conv(data, limit):
    """Convert the crawler's raw stats into the unified progress payload.

    Also records the crawler-reported limit on ``limit`` so it can be
    reported by later progress updates.

    Args:
        data: parsed crawl stats; must provide ``numCrawled``, ``total`` and
            a ``limit`` mapping with ``max`` and ``hit`` keys.
        limit: shared holder exposing writable ``max`` / ``hit`` attributes
            and an ``as_dict`` property; updated in place.

    Returns:
        A dict with ``done``, ``total`` and ``limit`` keys.

    Raises:
        KeyError: if an expected key is missing from ``data``.
    """
    # read both values before touching the shared holder so that a malformed
    # payload cannot leave it half-updated (max changed, hit stale)
    crawl_limit = data["limit"]
    limit_max, limit_hit = crawl_limit["max"], crawl_limit["hit"]
    limit.max, limit.hit = limit_max, limit_hit

    return {
        "done": data["numCrawled"],
        # crawl is considered 90% of the workload: total = crawl_total / 0.9
        "total": int(data["total"] / 0.9),
        "limit": limit.as_dict,
    }
def warc2zim_conv(data, limit):
# we consider warc2zim to be 10% of the workload so # we consider warc2zim to be 10% of the workload so
# warc2zim_total = 10% and total = 90 + warc2zim_total * 10% # warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
return { return {
@ -73,6 +91,7 @@ class ProgressFileWatcher:
* (0.9 + (float(data["written"]) / data["total"]) / 10) * (0.9 + (float(data["written"]) / data["total"]) / 10)
), ),
"total": data["total"], "total": data["total"],
"limit": limit.as_dict,
} }
for _, _, fpath, _ in ino.event_gen(yield_nones=False): for _, _, fpath, _ in ino.event_gen(yield_nones=False):
@ -82,7 +101,7 @@ class ProgressFileWatcher:
# open input and output separately as to not clear output on error # open input and output separately as to not clear output on error
with open(fpath, "r") as ifh: with open(fpath, "r") as ifh:
try: try:
out = func(json.load(ifh)) out = func(json.load(ifh), limit)
except Exception: # nosec except Exception: # nosec
# simply ignore progress update should an error arise # simply ignore progress update should an error arise
# might be malformed input for instance # might be malformed input for instance