mirror of
https://github.com/openzim/zimit.git
synced 2025-09-24 04:30:11 -04:00
capture and incorporates limit info from crawl
This commit is contained in:
parent
964746481f
commit
cfa4b0e7f8
@ -50,6 +50,7 @@ def test_stats_output():
|
|||||||
"numCrawled": 5,
|
"numCrawled": 5,
|
||||||
"workersRunning": 0,
|
"workersRunning": 0,
|
||||||
"total": 5,
|
"total": 5,
|
||||||
|
"limit": {"max": 0, "hit": False}
|
||||||
}
|
}
|
||||||
with open("/output/warc2zim.json") as fh:
|
with open("/output/warc2zim.json") as fh:
|
||||||
assert json.loads(fh.read()) == {
|
assert json.loads(fh.read()) == {
|
||||||
@ -60,4 +61,5 @@ def test_stats_output():
|
|||||||
assert json.loads(fh.read()) == {
|
assert json.loads(fh.read()) == {
|
||||||
"done": 7,
|
"done": 7,
|
||||||
"total": 7,
|
"total": 7,
|
||||||
|
"limit": {"max": 0, "hit": False}
|
||||||
}
|
}
|
||||||
|
29
zimit.py
29
zimit.py
@ -60,11 +60,29 @@ class ProgressFileWatcher:
|
|||||||
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
|
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
|
||||||
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
|
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
|
||||||
|
|
||||||
def crawl_conv(data):
|
class Limit:
|
||||||
# we consider crawl to be 90% of the workload so total = crawl_total / 90%
|
def __init__(self):
|
||||||
return {"done": data["numCrawled"], "total": int(data["total"] / 0.9)}
|
self.max = self.hit = None
|
||||||
|
|
||||||
def warc2zim_conv(data):
|
@property
|
||||||
|
def as_dict(self):
|
||||||
|
return {"max": self.max, "hit": self.hit}
|
||||||
|
|
||||||
|
# limit is only reported by crawl but needs to be reported up
|
||||||
|
limit = Limit()
|
||||||
|
|
||||||
|
def crawl_conv(data, limit):
|
||||||
|
# we consider crawl to be 90% of the workload so total = crawl_total / 90%
|
||||||
|
# limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
|
||||||
|
limit.max = data["limit"]["max"]
|
||||||
|
limit.hit = data["limit"]["hit"]
|
||||||
|
return {
|
||||||
|
"done": data["numCrawled"],
|
||||||
|
"total": int(data["total"] / 0.9),
|
||||||
|
"limit": limit.as_dict,
|
||||||
|
}
|
||||||
|
|
||||||
|
def warc2zim_conv(data, limit):
|
||||||
# we consider warc2zim to be 10% of the workload so
|
# we consider warc2zim to be 10% of the workload so
|
||||||
# warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
|
# warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
|
||||||
return {
|
return {
|
||||||
@ -73,6 +91,7 @@ class ProgressFileWatcher:
|
|||||||
* (0.9 + (float(data["written"]) / data["total"]) / 10)
|
* (0.9 + (float(data["written"]) / data["total"]) / 10)
|
||||||
),
|
),
|
||||||
"total": data["total"],
|
"total": data["total"],
|
||||||
|
"limit": limit.as_dict,
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, _, fpath, _ in ino.event_gen(yield_nones=False):
|
for _, _, fpath, _ in ino.event_gen(yield_nones=False):
|
||||||
@ -82,7 +101,7 @@ class ProgressFileWatcher:
|
|||||||
# open input and output separately so as not to clear output on error
|
# open input and output separately so as not to clear output on error
|
||||||
with open(fpath, "r") as ifh:
|
with open(fpath, "r") as ifh:
|
||||||
try:
|
try:
|
||||||
out = func(json.load(ifh))
|
out = func(json.load(ifh), limit)
|
||||||
except Exception: # nosec
|
except Exception: # nosec
|
||||||
# simply ignore progress update should an error arise
|
# simply ignore progress update should an error arise
|
||||||
# might be malformed input for instance
|
# might be malformed input for instance
|
||||||
|
Loading…
x
Reference in New Issue
Block a user