diff --git a/test/integration.py b/test/integration.py index f890ca0..cbc9ecc 100644 --- a/test/integration.py +++ b/test/integration.py @@ -4,26 +4,30 @@ import glob import libzim.reader from warcio import ArchiveIterator + def get_zim_article(zimfile, path): zim_fh = libzim.reader.File(zimfile) return zim_fh.get_article(path).content.tobytes() + def test_is_file(): """ Ensure ZIM file exists""" assert os.path.isfile("/output/isago.zim") -def test_zim_main_page(): - """ Main page specified, http://isago.ml/, was a redirect to https - Ensure main page is the redirected page""" - assert b'"https://isago.ml/"' in get_zim_article("/output/isago.zim", "A/index.html") +def test_zim_main_page(): + """Main page specified, http://isago.ml/, was a redirect to https + Ensure main page is the redirected page""" + + assert b'"https://isago.ml/"' in get_zim_article( + "/output/isago.zim", "A/index.html" + ) + def test_user_agent(): """ Test that mobile user agent was used in WARC request records with custom Zimit and email suffix""" found = False - #result = get_zim_article("/output/isago.zim", "H/isago.ml/") - #print(result) for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"): with open(warc, "rb") as fh: for record in ArchiveIterator(fh): diff --git a/zimit.py b/zimit.py index 771f77f..fe06355 100755 --- a/zimit.py +++ b/zimit.py @@ -89,6 +89,11 @@ def zimit(args=None): "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X" ) + parser.add_argument( + "--useSitemap", + help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)", + ) + zimit_args, warc2zim_args = parser.parse_known_args(args) # pass url and output to warc2zim also @@ -115,6 +120,7 @@ def zimit(args=None): temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) if not zimit_args.keep: + def cleanup(): print("") print("----------") @@ -142,7 +148,9 @@ def zimit(args=None): print("") print("----------") - print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}") + print( + f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}" + ) print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) subprocess.run(cmd_args, check=True) @@ -190,6 +198,7 @@ def get_node_cmd_line(args): "exclude", "scroll", "mobileDevice", + "useSitemap", ]: value = getattr(args, arg) if value: