mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
upgraded to browsertrix-crawler 0.9.0
This commit is contained in:
parent
4f676e37c7
commit
8ecd0a3210
@@ -7,9 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- `--title` to set ZIM title
|
||||
- `--description` to set ZIM description
|
||||
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.9.0-beta.2
|
||||
- Using browsertrix-crawler 0.9.0
|
||||
- Default and accepted values for `--waitUntil` changed to match the updated crawler
|
||||
- Using `main` warc2zim ⚠️ change before releasing!
|
||||
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
FROM webrecorder/browsertrix-crawler:0.9.0-beta.2
|
||||
FROM webrecorder/browsertrix-crawler:0.9.0
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||
|
||||
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
41
zimit.py
41
zimit.py
@@ -124,6 +124,8 @@ def zimit(args=None):
|
||||
)
|
||||
|
||||
parser.add_argument("-u", "--url", help="The URL to start crawling from")
|
||||
parser.add_argument("--title", help="ZIM title")
|
||||
parser.add_argument("--description", help="ZIM description")
|
||||
|
||||
parser.add_argument(
|
||||
"--urlFile",
|
||||
@@ -155,6 +157,13 @@ def zimit(args=None):
|
||||
"--limit", help="Limit crawl to this number of pages", type=int, default=0
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--maxPageLimit",
|
||||
help="Maximum pages to crawl, overriding pageLimit if both are set",
|
||||
type=int,
|
||||
default=0,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
help="Timeout for each page to load (in seconds)",
|
||||
@@ -217,7 +226,8 @@ def zimit(args=None):
|
||||
|
||||
parser.add_argument(
|
||||
"--useSitemap",
|
||||
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
|
||||
help="If set, use the URL as sitemap to get additional URLs for the crawl "
|
||||
"(usually /sitemap.xml)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -234,6 +244,14 @@ def zimit(args=None):
|
||||
default=90,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--delay",
|
||||
help="If >0, amount of time to sleep (in seconds) after behaviors "
|
||||
"before moving on to next page",
|
||||
type=int,
|
||||
default=0,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
help="Path to tar.gz file which will be extracted "
|
||||
@@ -247,6 +265,14 @@ def zimit(args=None):
|
||||
default=0,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--diskUtilization",
|
||||
help="If set, save state and exit if diskutilization "
|
||||
"exceeds this percentage value",
|
||||
type=int,
|
||||
default=90,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--timeLimit",
|
||||
help="If set, save state and exit after time limit, in seconds",
|
||||
@@ -308,6 +334,14 @@ def zimit(args=None):
|
||||
if zimit_args.custom_css:
|
||||
warc2zim_args += ["--custom-css", zimit_args.custom_css]
|
||||
|
||||
if zimit_args.title:
|
||||
warc2zim_args.append("--title")
|
||||
warc2zim_args.append(zimit_args.title)
|
||||
|
||||
if zimit_args.description:
|
||||
warc2zim_args.append("--description")
|
||||
warc2zim_args.append(zimit_args.description)
|
||||
|
||||
print("----------")
|
||||
print("Testing warc2zim args")
|
||||
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
||||
@@ -430,9 +464,12 @@ def get_node_cmd_line(args):
|
||||
"workers",
|
||||
"waitUntil",
|
||||
"urlFile",
|
||||
"title",
|
||||
"description",
|
||||
"depth",
|
||||
"extraHops",
|
||||
"limit",
|
||||
"maxPageLimit",
|
||||
"timeout",
|
||||
"scopeType",
|
||||
"include",
|
||||
@@ -444,8 +481,10 @@ def get_node_cmd_line(args):
|
||||
"useSitemap",
|
||||
"behaviors",
|
||||
"behaviorTimeout",
|
||||
"delay",
|
||||
"profile",
|
||||
"sizeLimit",
|
||||
"diskUtilization",
|
||||
"timeLimit",
|
||||
"healthCheckPort",
|
||||
"overwrite",
|
||||
|
Loading…
x
Reference in New Issue
Block a user