diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cd54e5f..57cd1a4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,7 +14,7 @@ jobs: run: docker build -t zimit . - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "iPhone 11" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c48a0d..e9e8aee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Using browsertrix-crawler 0.8.1 +- Using browsertrix-crawler 0.9.0-beta.1 +- Default and accepted values for `--waitUntil` from crawler's update - Using `main` warc2zim ⚠️ change before releasing! - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172) +### Removed + +- `--newContext` from crawler's update + ## [1.3.1] - 2023-02-06 ### Changed diff --git a/Dockerfile b/Dockerfile index 0663018..9f6187d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:0.8.1 +FROM webrecorder/browsertrix-crawler:0.9.0-beta.1 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/test/integration.py b/test/integration.py index 54f4d99..4fbd530 100644 --- a/test/integration.py +++ b/test/integration.py @@ -36,7 +36,7 @@ def test_user_agent(): print(record.http_headers) ua = record.http_headers.get_header("User-Agent") if ua: - assert "iPhone" in ua + assert "Pixel" in ua assert ua.endswith(" +Zimit test@example.com") found = True @@ -55,12 +55,12 @@ def test_stats_output(): } with open("/output/warc2zim.json") as fh: assert json.loads(fh.read()) == { - "written": 7, - "total": 7, + "written": 9, + "total": 9, } with open("/output/stats.json") as fh: assert json.loads(fh.read()) == { - "done": 7, - "total": 7, + "done": 9, + "total": 9, "limit": {"max": 0, "hit": False}, } diff --git a/zimit.py b/zimit.py index f74b307..0e03bed 100755 --- a/zimit.py +++ b/zimit.py @@ -115,7 +115,7 @@ class ProgressFileWatcher: def zimit(args=None): - wait_until_options = ["load", "domcontentloaded", "networkidle0", "networkidle2"] + wait_until_options = ["load", "domcontentloaded", "networkidle"] wait_until_all = wait_until_options + [ f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2) ] @@ -132,19 +132,12 @@ def zimit(args=None): parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers") - parser.add_argument( - "--newContext", - help="The context for each new capture (page, session or browser).", - choices=["page", "session", "browser"], - default="page", - ) - parser.add_argument( "--waitUntil", help="Puppeteer page.goto() condition to wait for before continuing. One of " f"{wait_until_options} or a comma-separated combination of those.", choices=wait_until_all, - default="load,networkidle0", + default="load", ) parser.add_argument( @@ -206,8 +199,8 @@ def zimit(args=None): parser.add_argument( "--mobileDevice", help="Emulate mobile device by name from " - "https://github.com/puppeteer/puppeteer/blob" - "/main/src/common/DeviceDescriptors.ts", + "https://github.com/microsoft/playwright/blob/main/packages/" + "playwright-core/src/server/deviceDescriptorsSource.json", ) parser.add_argument( @@ -435,7 +428,6 @@ def get_node_cmd_line(args): node_cmd = ["crawl"] for arg in [ "workers", - "newContext", "waitUntil", "urlFile", "depth",