upgraded to browsertrix-crawler 0.9.0

This commit is contained in:
renaud gaudin 2023-04-10 13:08:12 +00:00
parent 4f676e37c7
commit 8ecd0a3210
3 changed files with 48 additions and 3 deletions

View File

@ -7,9 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- `--title` to set ZIM title
- `--description` to set ZIM description
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
### Changed
- Using browsertrix-crawler 0.9.0-beta.2
- Using browsertrix-crawler 0.9.0
- Default and accepted values for `--waitUntil` from crawler's update
- Using `main` branch of warc2zim ⚠️ pin to a released warc2zim version before releasing!
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)

View File

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:0.9.0-beta.2
FROM webrecorder/browsertrix-crawler:0.9.0
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*

View File

@ -124,6 +124,8 @@ def zimit(args=None):
)
parser.add_argument("-u", "--url", help="The URL to start crawling from")
parser.add_argument("--title", help="ZIM title")
parser.add_argument("--description", help="ZIM description")
parser.add_argument(
"--urlFile",
@ -155,6 +157,13 @@ def zimit(args=None):
"--limit", help="Limit crawl to this number of pages", type=int, default=0
)
parser.add_argument(
"--maxPageLimit",
help="Maximum pages to crawl, overriding pageLimit if both are set",
type=int,
default=0,
)
parser.add_argument(
"--timeout",
help="Timeout for each page to load (in seconds)",
@ -217,7 +226,8 @@ def zimit(args=None):
parser.add_argument(
"--useSitemap",
help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
help="If set, use the URL as sitemap to get additional URLs for the crawl "
"(usually /sitemap.xml)",
)
parser.add_argument(
@ -234,6 +244,14 @@ def zimit(args=None):
default=90,
)
parser.add_argument(
"--delay",
help="If >0, amount of time to sleep (in seconds) after behaviors "
"before moving on to next page",
type=int,
default=0,
)
parser.add_argument(
"--profile",
help="Path to tar.gz file which will be extracted "
@ -247,6 +265,14 @@ def zimit(args=None):
default=0,
)
parser.add_argument(
"--diskUtilization",
help="If set, save state and exit if diskutilization "
"exceeds this percentage value",
type=int,
default=90,
)
parser.add_argument(
"--timeLimit",
help="If set, save state and exit after time limit, in seconds",
@ -308,6 +334,14 @@ def zimit(args=None):
if zimit_args.custom_css:
warc2zim_args += ["--custom-css", zimit_args.custom_css]
if zimit_args.title:
warc2zim_args.append("--title")
warc2zim_args.append(zimit_args.title)
if zimit_args.description:
warc2zim_args.append("--description")
warc2zim_args.append(zimit_args.description)
print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
@ -430,9 +464,12 @@ def get_node_cmd_line(args):
"workers",
"waitUntil",
"urlFile",
"title",
"description",
"depth",
"extraHops",
"limit",
"maxPageLimit",
"timeout",
"scopeType",
"include",
@ -444,8 +481,10 @@ def get_node_cmd_line(args):
"useSitemap",
"behaviors",
"behaviorTimeout",
"delay",
"profile",
"sizeLimit",
"diskUtilization",
"timeLimit",
"healthCheckPort",
"overwrite",