Add support for a --useSitemap <url> flag to load additional URLs from a sitemap for the crawl, potentially fixing #34!

reformat
This commit is contained in:
Ilya Kreymer 2020-11-14 22:01:36 +00:00
parent a801a1eef6
commit c0bb0503b8
2 changed files with 20 additions and 7 deletions

View File

@ -4,26 +4,30 @@ import glob
import libzim.reader import libzim.reader
from warcio import ArchiveIterator from warcio import ArchiveIterator
def get_zim_article(zimfile, path): def get_zim_article(zimfile, path):
zim_fh = libzim.reader.File(zimfile) zim_fh = libzim.reader.File(zimfile)
return zim_fh.get_article(path).content.tobytes() return zim_fh.get_article(path).content.tobytes()
def test_is_file(): def test_is_file():
""" Ensure ZIM file exists""" """ Ensure ZIM file exists"""
assert os.path.isfile("/output/isago.zim") assert os.path.isfile("/output/isago.zim")
def test_zim_main_page():
""" Main page specified, http://isago.ml/, was a redirect to https
Ensure main page is the redirected page"""
assert b'"https://isago.ml/"' in get_zim_article("/output/isago.zim", "A/index.html") def test_zim_main_page():
"""Main page specified, http://isago.ml/, was a redirect to https
Ensure main page is the redirected page"""
assert b'"https://isago.ml/"' in get_zim_article(
"/output/isago.zim", "A/index.html"
)
def test_user_agent(): def test_user_agent():
""" Test that mobile user agent was used in WARC request records with custom Zimit and email suffix""" """ Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
found = False found = False
#result = get_zim_article("/output/isago.zim", "H/isago.ml/")
#print(result)
for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"): for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"):
with open(warc, "rb") as fh: with open(warc, "rb") as fh:
for record in ArchiveIterator(fh): for record in ArchiveIterator(fh):

View File

@ -89,6 +89,11 @@ def zimit(args=None):
"--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X" "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
) )
parser.add_argument(
"--useSitemap",
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
)
zimit_args, warc2zim_args = parser.parse_known_args(args) zimit_args, warc2zim_args = parser.parse_known_args(args)
# pass url and output to warc2zim also # pass url and output to warc2zim also
@ -115,6 +120,7 @@ def zimit(args=None):
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
if not zimit_args.keep: if not zimit_args.keep:
def cleanup(): def cleanup():
print("") print("")
print("----------") print("----------")
@ -142,7 +148,9 @@ def zimit(args=None):
print("") print("")
print("----------") print("----------")
print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}") print(
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
)
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
subprocess.run(cmd_args, check=True) subprocess.run(cmd_args, check=True)
@ -190,6 +198,7 @@ def get_node_cmd_line(args):
"exclude", "exclude",
"scroll", "scroll",
"mobileDevice", "mobileDevice",
"useSitemap",
]: ]:
value = getattr(args, arg) value = getattr(args, arg)
if value: if value: