updated to crawler 0.9 (b1)

This commit is contained in:
renaud gaudin 2023-03-24 07:26:10 +00:00
parent b8714d1260
commit b7265b49b6
5 changed files with 17 additions and 20 deletions

View File

@ -14,7 +14,7 @@ jobs:
run: docker build -t zimit .
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"

View File

@ -9,10 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Using browsertrix-crawler 0.8.1
- Using browsertrix-crawler 0.9.0-beta.1
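- Default (now `load`) and accepted values (`load`, `domcontentloaded`, `networkidle`, or a comma-separated pair of those) for `--waitUntil`, following the crawler's update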
- Using `main` warc2zim ⚠️ change before releasing!
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
### Removed
- `--newContext` option, following its removal in the crawler's update
## [1.3.1] - 2023-02-06
### Changed

View File

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:0.8.1
FROM webrecorder/browsertrix-crawler:0.9.0-beta.1
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*

View File

@ -36,7 +36,7 @@ def test_user_agent():
print(record.http_headers)
ua = record.http_headers.get_header("User-Agent")
if ua:
assert "iPhone" in ua
assert "Pixel" in ua
assert ua.endswith(" +Zimit test@example.com")
found = True
@ -55,12 +55,12 @@ def test_stats_output():
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 7,
"total": 7,
"written": 9,
"total": 9,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 7,
"total": 7,
"done": 9,
"total": 9,
"limit": {"max": 0, "hit": False},
}

View File

@ -115,7 +115,7 @@ class ProgressFileWatcher:
def zimit(args=None):
wait_until_options = ["load", "domcontentloaded", "networkidle0", "networkidle2"]
wait_until_options = ["load", "domcontentloaded", "networkidle"]
wait_until_all = wait_until_options + [
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
]
@ -132,19 +132,12 @@ def zimit(args=None):
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
parser.add_argument(
"--newContext",
help="The context for each new capture (page, session or browser).",
choices=["page", "session", "browser"],
default="page",
)
parser.add_argument(
"--waitUntil",
help="Puppeteer page.goto() condition to wait for before continuing. One of "
f"{wait_until_options} or a comma-separated combination of those.",
choices=wait_until_all,
default="load,networkidle0",
default="load",
)
parser.add_argument(
@ -206,8 +199,8 @@ def zimit(args=None):
parser.add_argument(
"--mobileDevice",
help="Emulate mobile device by name from "
"https://github.com/puppeteer/puppeteer/blob"
"/main/src/common/DeviceDescriptors.ts",
"https://github.com/microsoft/playwright/blob/main/packages/"
"playwright-core/src/server/deviceDescriptorsSource.json",
)
parser.add_argument(
@ -435,7 +428,6 @@ def get_node_cmd_line(args):
node_cmd = ["crawl"]
for arg in [
"workers",
"newContext",
"waitUntil",
"urlFile",
"depth",