add support for --useSitemap <url> flag to load additional URLs, potentially fixing #34!

reformat
This commit is contained in:
Ilya Kreymer 2020-11-14 22:01:36 +00:00
parent a801a1eef6
commit c0bb0503b8
2 changed files with 20 additions and 7 deletions

View File

@ -4,26 +4,30 @@ import glob
import libzim.reader
from warcio import ArchiveIterator
def get_zim_article(zimfile, path):
    """Return the content of the article at ``path`` in ``zimfile`` as bytes."""
    # Keep the reader bound to a local for the whole call so it stays alive
    # while the article content is being read.
    reader = libzim.reader.File(zimfile)
    article = reader.get_article(path)
    return article.content.tobytes()
def test_is_file():
    """Ensure the ZIM file exists."""
    zim_path = "/output/isago.zim"
    assert os.path.isfile(zim_path)
def test_zim_main_page():
    """Ensure the ZIM main page is the redirected page.

    The main page specified, http://isago.ml/, was a redirect to https, so
    the stored A/index.html must contain the https URL.
    """
    main_page = get_zim_article("/output/isago.zim", "A/index.html")
    assert b'"https://isago.ml/"' in main_page
def test_zim_main_page():
    """Verify that the main page in the ZIM is the https redirect target.

    The crawl's main page, http://isago.ml/, redirects to https; the quoted
    https URL is expected to appear in the content of A/index.html.
    """
    expected_url = b'"https://isago.ml/"'
    index_html = get_zim_article("/output/isago.zim", "A/index.html")
    assert expected_url in index_html
def test_user_agent():
""" Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
found = False
#result = get_zim_article("/output/isago.zim", "H/isago.ml/")
#print(result)
for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"):
with open(warc, "rb") as fh:
for record in ArchiveIterator(fh):

View File

@ -89,6 +89,11 @@ def zimit(args=None):
"--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
)
parser.add_argument(
"--useSitemap",
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
)
zimit_args, warc2zim_args = parser.parse_known_args(args)
# pass url and output to warc2zim also
@ -115,6 +120,7 @@ def zimit(args=None):
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
if not zimit_args.keep:
def cleanup():
print("")
print("----------")
@ -142,7 +148,9 @@ def zimit(args=None):
print("")
print("----------")
print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}")
print(
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
)
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
subprocess.run(cmd_args, check=True)
@ -190,6 +198,7 @@ def get_node_cmd_line(args):
"exclude",
"scroll",
"mobileDevice",
"useSitemap",
]:
value = getattr(args, arg)
if value: