mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
updated to crawler 0.9 (b1)
This commit is contained in:
parent
b8714d1260
commit
b7265b49b6
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
@ -14,7 +14,7 @@ jobs:
|
||||
run: docker build -t zimit .
|
||||
|
||||
- name: run crawl
|
||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep
|
||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
|
||||
|
@ -9,10 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.8.1
|
||||
- Using browsertrix-crawler 0.9.0-beta.1
|
||||
- Default and accepted values for `--waitUntil`, following the crawler's update
|
||||
- Using warc2zim from its `main` branch ⚠️ change this before releasing!
|
||||
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
|
||||
|
||||
### Removed
|
||||
|
||||
- `--newContext` option, following the crawler's update
|
||||
|
||||
## [1.3.1] - 2023-02-06
|
||||
|
||||
### Changed
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM webrecorder/browsertrix-crawler:0.8.1
|
||||
FROM webrecorder/browsertrix-crawler:0.9.0-beta.1
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||
|
||||
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
@ -36,7 +36,7 @@ def test_user_agent():
|
||||
print(record.http_headers)
|
||||
ua = record.http_headers.get_header("User-Agent")
|
||||
if ua:
|
||||
assert "iPhone" in ua
|
||||
assert "Pixel" in ua
|
||||
assert ua.endswith(" +Zimit test@example.com")
|
||||
found = True
|
||||
|
||||
@ -55,12 +55,12 @@ def test_stats_output():
|
||||
}
|
||||
with open("/output/warc2zim.json") as fh:
|
||||
assert json.loads(fh.read()) == {
|
||||
"written": 7,
|
||||
"total": 7,
|
||||
"written": 9,
|
||||
"total": 9,
|
||||
}
|
||||
with open("/output/stats.json") as fh:
|
||||
assert json.loads(fh.read()) == {
|
||||
"done": 7,
|
||||
"total": 7,
|
||||
"done": 9,
|
||||
"total": 9,
|
||||
"limit": {"max": 0, "hit": False},
|
||||
}
|
||||
|
16
zimit.py
16
zimit.py
@ -115,7 +115,7 @@ class ProgressFileWatcher:
|
||||
|
||||
|
||||
def zimit(args=None):
|
||||
wait_until_options = ["load", "domcontentloaded", "networkidle0", "networkidle2"]
|
||||
wait_until_options = ["load", "domcontentloaded", "networkidle"]
|
||||
wait_until_all = wait_until_options + [
|
||||
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
|
||||
]
|
||||
@ -132,19 +132,12 @@ def zimit(args=None):
|
||||
|
||||
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
|
||||
|
||||
parser.add_argument(
|
||||
"--newContext",
|
||||
help="The context for each new capture (page, session or browser).",
|
||||
choices=["page", "session", "browser"],
|
||||
default="page",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--waitUntil",
|
||||
help="Puppeteer page.goto() condition to wait for before continuing. One of "
|
||||
f"{wait_until_options} or a comma-separated combination of those.",
|
||||
choices=wait_until_all,
|
||||
default="load,networkidle0",
|
||||
default="load",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@ -206,8 +199,8 @@ def zimit(args=None):
|
||||
parser.add_argument(
|
||||
"--mobileDevice",
|
||||
help="Emulate mobile device by name from "
|
||||
"https://github.com/puppeteer/puppeteer/blob"
|
||||
"/main/src/common/DeviceDescriptors.ts",
|
||||
"https://github.com/microsoft/playwright/blob/main/packages/"
|
||||
"playwright-core/src/server/deviceDescriptorsSource.json",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@ -435,7 +428,6 @@ def get_node_cmd_line(args):
|
||||
node_cmd = ["crawl"]
|
||||
for arg in [
|
||||
"workers",
|
||||
"newContext",
|
||||
"waitUntil",
|
||||
"urlFile",
|
||||
"depth",
|
||||
|
Loading…
x
Reference in New Issue
Block a user