mirror of
https://github.com/openzim/zimit.git
synced 2025-09-23 12:05:13 -04:00
Add support for the --useSitemap <url> flag to load additional URLs from a sitemap, potentially fixing #34.
reformat
This commit is contained in:
parent
a801a1eef6
commit
c0bb0503b8
@ -4,26 +4,30 @@ import glob
|
||||
import libzim.reader
|
||||
from warcio import ArchiveIterator
|
||||
|
||||
|
||||
def get_zim_article(zimfile, path):
|
||||
zim_fh = libzim.reader.File(zimfile)
|
||||
return zim_fh.get_article(path).content.tobytes()
|
||||
|
||||
|
||||
def test_is_file():
|
||||
""" Ensure ZIM file exists"""
|
||||
assert os.path.isfile("/output/isago.zim")
|
||||
|
||||
def test_zim_main_page():
|
||||
""" Main page specified, http://isago.ml/, was a redirect to https
|
||||
Ensure main page is the redirected page"""
|
||||
|
||||
assert b'"https://isago.ml/"' in get_zim_article("/output/isago.zim", "A/index.html")
|
||||
def test_zim_main_page():
|
||||
"""Main page specified, http://isago.ml/, was a redirect to https
|
||||
Ensure main page is the redirected page"""
|
||||
|
||||
assert b'"https://isago.ml/"' in get_zim_article(
|
||||
"/output/isago.zim", "A/index.html"
|
||||
)
|
||||
|
||||
|
||||
def test_user_agent():
|
||||
""" Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
|
||||
|
||||
found = False
|
||||
#result = get_zim_article("/output/isago.zim", "H/isago.ml/")
|
||||
#print(result)
|
||||
for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"):
|
||||
with open(warc, "rb") as fh:
|
||||
for record in ArchiveIterator(fh):
|
||||
|
11
zimit.py
11
zimit.py
@ -89,6 +89,11 @@ def zimit(args=None):
|
||||
"--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--useSitemap",
|
||||
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
|
||||
)
|
||||
|
||||
zimit_args, warc2zim_args = parser.parse_known_args(args)
|
||||
|
||||
# pass url and output to warc2zim also
|
||||
@ -115,6 +120,7 @@ def zimit(args=None):
|
||||
temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))
|
||||
|
||||
if not zimit_args.keep:
|
||||
|
||||
def cleanup():
|
||||
print("")
|
||||
print("----------")
|
||||
@ -142,7 +148,9 @@ def zimit(args=None):
|
||||
|
||||
print("")
|
||||
print("----------")
|
||||
print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}")
|
||||
print(
|
||||
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
|
||||
)
|
||||
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
|
||||
subprocess.run(cmd_args, check=True)
|
||||
|
||||
@ -190,6 +198,7 @@ def get_node_cmd_line(args):
|
||||
"exclude",
|
||||
"scroll",
|
||||
"mobileDevice",
|
||||
"useSitemap",
|
||||
]:
|
||||
value = getattr(args, arg)
|
||||
if value:
|
||||
|
Loading…
x
Reference in New Issue
Block a user