mirror of https://github.com/openzim/zimit.git
synced 2025-09-23 03:52:16 -04:00

Updated to browsertrix-crawler 0.6 and warc2zim 1.4

This commit is contained in:
parent 8b5eeb31c7
commit 1f490ace8f

CHANGELOG.md (50 changes)
@@ -1,8 +1,46 @@
-# 1.1.5
+## Changelog
+
+All notable changes to this project are documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+
+## [Unreleased]
+
+### Added
+
+- `--urlFile` browsertrix crawler parameter
+- `--depth` browsertrix crawler parameter
+- `--extraHops` browsertrix crawler parameter
+- `--collection` browsertrix crawler parameter
+- `--allowHashUrls` browsertrix crawler parameter
+- `--userAgentSuffix` browsertrix crawler parameter
+- `--behaviors` browsertrix crawler parameter
+- `--behaviorTimeout` browsertrix crawler parameter
+- `--profile` browsertrix crawler parameter
+- `--sizeLimit` browsertrix crawler parameter
+- `--timeLimit` browsertrix crawler parameter
+- `--healthCheckPort` browsertrix crawler parameter
+- `--overwrite` parameter
+
+### Changed
+
+- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
+- default WARC location after crawl changed
+  from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
+
+### Removed
+
+- `--scroll` browsertrix crawler parameter (see `--behaviors`)
+- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
+
+## [1.1.5]

 - using crawler 0.3.2 and warc2zim 1.3.6

-# 1.1.4
+## [1.1.4]

 - Defaults to `load,networkidle0` for waitUntil param (same as crawler)
 - Allows setting combinations of values for waitUntil param

@@ -11,23 +49,23 @@
 - Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
   `capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)

-# 1.1.3
+## [1.1.3]

 - allows same first-level-domain redirects
 - fixed redirects to URL in scope
 - updated crawler to 0.2.0
 - `statsFilename` now informs whether limit was hit or not

-# 1.1.2
+## [1.1.2]

 - added support for --custom-css
 - added domains block list (default)

-# 1.1.1
+## [1.1.1]

 - updated browsertrix-crawler to 0.1.4
 - autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets

-# 1.0
+## [1.0]

 - initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3
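The new parameters combine on a single command line. A minimal sketch of an invocation exercising several of them, assuming the `zimit` entrypoint is on PATH; flag names come from the changelog above and every value is illustrative only:

import subprocess

# Hypothetical invocation; values are made up for the example.
cmd = [
    "zimit",
    "--url", "https://example.com/",
    "--scopeType", "host",          # replaces the removed --scope regex
    "--behaviorTimeout", "90",      # seconds per page for in-page behaviors
    "--sizeLimit", "4294967296",    # save state and exit past ~4 GiB
    "--timeLimit", "7200",          # or after two hours
    "--overwrite",                  # delete any existing collection dir first
]
subprocess.run(cmd, check=True)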
Dockerfile

@@ -1,9 +1,9 @@
-FROM webrecorder/browsertrix-crawler:0.6.0-beta.1
+FROM webrecorder/browsertrix-crawler:0.6.0
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit

 RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*

-RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.0'
+RUN pip3.8 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.4.3'

 RUN mkdir -p /output
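A quick way to confirm the pins resolved as expected inside the built image is to query the installed distributions; a sketch, assuming Python 3.8+ (importlib.metadata is stdlib from 3.8):

from importlib.metadata import version

# Package names and expected versions are taken from the RUN pip3.8 line above.
for pkg, expected in [("warc2zim", "1.4.3"), ("inotify", "0.2.10")]:
    installed = version(pkg)
    assert installed == expected, f"{pkg}: got {installed}, want {expected}"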
zimit.py (182 changes)
@@ -125,6 +125,11 @@ def zimit(args=None):

     parser.add_argument("-u", "--url", help="The URL to start crawling from")

+    parser.add_argument(
+        "--urlFile",
+        help="If set, read a list of seed urls, " "one per line, from the specified file",
+    )
+
     parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")

     parser.add_argument(
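--urlFile expects a plain-text seed list, one URL per line, per the help text above. An illustrative way to produce one (file name and URLs are hypothetical):

# seeds.txt would then be passed as: zimit --urlFile seeds.txt
seeds = ["https://example.com/", "https://example.com/docs/"]
with open("seeds.txt", "w") as fh:
    fh.write("\n".join(seeds) + "\n")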
@@ -142,6 +147,17 @@ def zimit(args=None):
         default="load,networkidle0",
     )

+    parser.add_argument(
+        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+    )
+
+    parser.add_argument(
+        "--extraHops",
+        help="Number of extra 'hops' to follow, beyond the current scope",
+        type=int,
+        default=0,
+    )
+
     parser.add_argument(
         "--limit", help="Limit crawl to this number of pages", type=int, default=0
     )
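--depth bounds how many link levels are followed from each seed (the -1 default presumably means no depth cap), while --extraHops allows the crawl to step a fixed number of links beyond the configured scope. An illustrative combination, all values hypothetical:

# Crawl two levels deep, but still capture pages one hop outside the scope.
cmd = ["zimit", "--url", "https://example.com/docs/", "--depth", "2", "--extraHops", "1"]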
@@ -154,18 +170,107 @@ def zimit(args=None):
     )

     parser.add_argument(
-        "--scope",
-        help="Regex of page URLs that should be included in the crawl "
-        "(defaults to the immediate directory of the URL)",
+        "--scopeType",
+        help="A predefined scope of the crawl. For more customization, "
+        "use 'custom' and set scopeIncludeRx regexes",
+        choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
     )

     parser.add_argument(
-        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
+        "--include",
+        help="Regex of page URLs that should be "
+        "included in the crawl (defaults to "
+        "the immediate directory of URL)",
     )

     parser.add_argument(
-        "--scroll",
-        help="If set, will autoscroll to bottom of the page",
+        "--exclude",
+        help="Regex of page URLs that should be excluded from the crawl",
+    )
+
+    parser.add_argument(
+        "--collection",
+        help="Collection name to crawl to (replay will be accessible "
+        "under this name in pywb preview) instead of crawl-@ts",
+    )
+
+    parser.add_argument(
+        "--allowHashUrls",
+        help="Allow Hashtag URLs, useful for "
+        "single-page-application crawling or "
+        "when different hashtags load dynamic "
+        "content",
+    )
+
+    parser.add_argument(
+        "--mobileDevice",
+        help="Emulate mobile device by name from "
+        "https://github.com/puppeteer/puppeteer/blob"
+        "/main/src/common/DeviceDescriptors.ts",
+    )
+
+    parser.add_argument(
+        "--userAgent",
+        help="Override user-agent with specified",
+    )
+
+    parser.add_argument(
+        "--userAgentSuffix",
+        help="Append suffix to existing browser user-agent "
+        "(ex: +MyCrawler, info@example.com)",
+        default="+Zimit ",
+    )
+
+    parser.add_argument(
+        "--useSitemap",
+        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
+    )
+
+    parser.add_argument(
+        "--behaviors",
+        help="Which background behaviors to enable on each page",
+        default="autoplay,autofetch,siteSpecific",
+    )
+
+    parser.add_argument(
+        "--behaviorTimeout",
+        help="If >0, timeout (in seconds) for in-page behavior will run on each page. "
+        "If 0, a behavior can run until finish",
+        type=int,
+        default=90,
+    )
+
+    parser.add_argument(
+        "--profile",
+        help="Path to tar.gz file which will be extracted "
+        "and used as the browser profile",
+    )
+
+    parser.add_argument(
+        "--sizeLimit",
+        help="If set, save state and exit if size limit exceeds this value",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--timeLimit",
+        help="If set, save state and exit after time limit, in seconds",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--healthCheckPort",
+        help="port to run healthcheck on",
+        type=int,
+        default=0,
+    )
+
+    parser.add_argument(
+        "--overwrite",
+        help="overwrite current crawl data: if set, existing collection directory "
+        "will be deleted before crawl is started",
         action="store_true",
         default=False,
     )
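With --scope gone, a regex-scoped crawl now goes through scopeType 'custom' plus --include, as the help text above describes. A hypothetical equivalent of the old behavior:

# Old: --scope "https://example\.com/blog/.*"
# New (sketch): pick the 'custom' scope and supply the regex via --include.
cmd = [
    "zimit",
    "--url", "https://example.com/blog/",
    "--scopeType", "custom",
    "--include", r"https://example\.com/blog/.*",
]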
@@ -182,15 +287,6 @@ def zimit(args=None):

     parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")

-    parser.add_argument(
-        "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X"
-    )
-
-    parser.add_argument(
-        "--useSitemap",
-        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
-    )
-
     parser.add_argument(
         "--custom-css",
         help="[warc2zim] Custom CSS file URL/path to inject into all articles",
@@ -211,7 +307,7 @@ def zimit(args=None):
     url = zimit_args.url

     if url:
-        url = check_url(url, zimit_args.scope)
+        url = check_url(url, zimit_args.scopeType)
         warc2zim_args.append("--url")
         warc2zim_args.append(url)
@@ -244,7 +340,7 @@ def zimit(args=None):
         cmd_args.append("--url")
         cmd_args.append(url)

-    user_agent_suffix = "+Zimit "
+    user_agent_suffix = zimit_args.userAgentSuffix
     if zimit_args.adminEmail:
         user_agent_suffix += zimit_args.adminEmail
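The suffix and the admin email concatenate into the final user-agent tail; a sketch with illustrative values:

user_agent_suffix = "+Zimit "            # default of --userAgentSuffix
admin_email = "contact@example.com"      # hypothetical --adminEmail value
print(user_agent_suffix + admin_email)   # +Zimit contact@example.com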
@@ -277,9 +373,15 @@ def zimit(args=None):
         f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
     )
     print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
-    subprocess.run(cmd_args, check=True)
+    crawl = subprocess.run(cmd_args)
+    if crawl.returncode == 11:
+        print("crawl interrupted by a limit")
+    elif crawl.returncode != 0:
+        raise subprocess.CalledProcessError(
+            crawl.returncode, cmd_args
+        )

-    warc_files = list(temp_root_dir.rglob("collections/capture-*/archive/"))[-1]
+    warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1]
     warc2zim_args.append(str(warc_files))

     num_files = sum(1 for e in warc_files.iterdir())
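Constructing CalledProcessError by hand needs both the return code and the command, which is why the raise above passes them positionally. A self-contained check of the message it produces:

import subprocess

err = subprocess.CalledProcessError(11, ["crawl", "--url", "https://example.com/"])
print(str(err))
# Command '['crawl', '--url', 'https://example.com/']' returned non-zero exit status 11.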
@@ -300,22 +402,21 @@ def check_url(url, scope=None):
     actual_url = resp.url

     if actual_url != url:
-        # redirect on same domain or same first-level domain
-        if get_fld(url) == get_fld(actual_url):
-            return actual_url
-
-        # is it in scope?
-        if scope:
-            try:
-                if re.match(scope, actual_url):
-                    return actual_url
-            except Exception as exc:
-                print(f"failed to parse your scope regexp for url checking: {exc}")
-
-        raise ValueError(
-            f"Main page URL ({url}) redirects to out-of-scope domain "
-            f"({actual_url}), cancelling crawl"
-        )
+        if scope in (None, "any"):
+            return actual_url
+
+        print(
+            "[WARN] Your URL ({0}) redirects to {1} which {2} on same "
+            "first-level domain. Depending on your scopeType ({3}), "
+            "your homepage might be out-of-scope. Please check!".format(
+                url,
+                actual_url,
+                "is" if get_fld(url) == get_fld(actual_url) else "is not",
+                scope,
+            )
+        )
+
+        return actual_url

     return url
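The first-level-domain comparison above relies on get_fld from the tld package (pinned in the Dockerfile). Its behavior on a couple of sample URLs:

from tld import get_fld

print(get_fld("https://shop.example.com/page"))  # example.com
print(get_fld("https://example.co.uk/"))         # example.co.uk (public-suffix aware)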
@@ -326,13 +427,26 @@ def get_node_cmd_line(args):
         "workers",
         "newContext",
         "waitUntil",
+        "urlFile",
+        "depth",
+        "extraHops",
         "limit",
         "timeout",
-        "scope",
+        "scopeType",
+        "include",
         "exclude",
-        "scroll",
+        "collection",
+        "allowHashUrls",
         "mobileDevice",
+        "userAgent",
         "useSitemap",
+        "behaviors",
+        "behaviorTimeout",
+        "profile",
+        "sizeLimit",
+        "timeLimit",
+        "healthCheckPort",
+        "overwrite",
     ]:
         value = getattr(args, arg)
         if value:
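The loop above appears to forward each truthy option to the crawler CLI as a --flag, with the value appended unless the option is a boolean switch. A simplified, hypothetical model of that mapping; the real get_node_cmd_line in zimit.py is authoritative:

def to_node_flags(options):
    # options: dict of parsed zimit args, e.g. {"workers": 4, "overwrite": True}
    cmd = ["crawl"]
    for name, value in options.items():
        if value:
            cmd.append("--" + name)
            if not isinstance(value, bool):
                cmd.append(str(value))
    return cmd

print(to_node_flags({"workers": 4, "scopeType": "host", "overwrite": True}))
# ['crawl', '--workers', '4', '--scopeType', 'host', '--overwrite']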