new crawler folder structure

This commit is contained in:
renaud gaudin 2021-05-12 17:03:48 +00:00
parent 03abf6050a
commit 2e9c129523
2 changed files with 3 additions and 1 deletions

View File

@ -4,6 +4,8 @@
- Allows setting combinations of values for waitUntil param
- Updated warc2zim to 1.3.5
- Updated browsertrix-crawler to 0.3.1
- WARC files to convert to ZIM are now written to `{temp_root_dir}/collections/capture-*/archive/`, where
`capture-*` is dynamic and includes the datetime (change from browsertrix-crawler).
# 1.1.3

View File

@ -279,7 +279,7 @@ def zimit(args=None):
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
subprocess.run(cmd_args, check=True)
warc_files = temp_root_dir / "collections" / "capture" / "archive"
warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
warc2zim_args.append(str(warc_files))
num_files = sum(1 for e in warc_files.iterdir())