From a930542af82a6e066c49b3e073082c86eb46ec67 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Nov 2020 19:36:47 +0000 Subject: [PATCH 1/6] mobile + user agent support: - add support for custom user agent suffix +Zimit with email address specifiable via --adminEmail cmd arg #38 - add ability to crawl as mobile device with --mobileDevice flag (default to iPhone X) add integration tests runnable in docker via github actions logging: print temp dir, flush print statements for immediate logging --- .github/workflows/ci.yaml | 6 +++--- Dockerfile | 5 +++-- test/integration.py | 38 ++++++++++++++++++++++++++++++++++++++ zimit.py | 28 +++++++++++++++++++++------- 4 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 test/integration.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fef5e0f..13637b6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,7 +13,7 @@ jobs: run: docker build -t openzim/zimit:dev . - name: run crawl - run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim + run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice - - name: ensure zim exists - run: stat ./output/isago.zim + - name: run integration test suite + run: docker run -it -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./test/integration.py" diff --git a/Dockerfile b/Dockerfile index 0a78e00..ea89f56 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,11 @@ -FROM webrecorder/browsertrix-crawler:0.1.1 +FROM webrecorder/browsertrix-crawler:0.1.2 RUN mkdir -p /output WORKDIR /app -RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0' +#RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0' +RUN pip install git+https://github.com/openzim/warc2zim.git@replay-update ADD 
zimit.py /app/ diff --git a/test/integration.py b/test/integration.py new file mode 100644 index 0000000..e73dbbc --- /dev/null +++ b/test/integration.py @@ -0,0 +1,38 @@ +import os +import glob + +import libzim.reader +from warcio import ArchiveIterator + +def get_zim_article(zimfile, path): + zim_fh = libzim.reader.File(zimfile) + return zim_fh.get_article(path).content.tobytes() + +def test_is_file(): + """ Ensure ZIM file exists""" + assert os.path.isfile("/output/isago.zim") + +def test_zim_main_page(): + """ Main page specified, http://isago.ml/, was a redirect to https + Ensure main page is the redirected page""" + + assert b'"https://isago.ml/"' in get_zim_article("/output/isago.zim", "A/index.html") + +def test_user_agent(): + """ Test that mobile user agent was used in WARC request records with custom Zimit and email suffix""" + + #result = get_zim_article("/output/isago.zim", "H/isago.ml/") + #print(result) + for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"): + with open(warc, "rb") as fh: + for record in ArchiveIterator(fh): + if record.rec_type == "request": + print(record.http_headers) + ua = record.http_headers.get_header("User-Agent") + if ua: + assert "iPhone" in ua + assert ua.endswith(" +Zimit test@example.com") + return + + # not found + assert False diff --git a/zimit.py b/zimit.py index 185299d..771f77f 100755 --- a/zimit.py +++ b/zimit.py @@ -83,6 +83,12 @@ def zimit(args=None): "--output", help="Output directory for ZIM and WARC files", default="/output" ) + parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler") + + parser.add_argument( + "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X" + ) + zimit_args, warc2zim_args = parser.parse_known_args(args) # pass url and output to warc2zim also @@ -99,7 +105,7 @@ def zimit(args=None): print("----------") print("Testing warc2zim args") - print("Running: warc2zim " + " ".join(warc2zim_args)) + print("Running: warc2zim " + " 
".join(warc2zim_args), flush=True) res = warc2zim(warc2zim_args) if res != 100: print("Exiting, invalid warc2zim params") @@ -109,11 +115,10 @@ def zimit(args=None): temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) if not zimit_args.keep: - def cleanup(): print("") print("----------") - print(f"Cleanup, removing temp dir: {temp_root_dir}") + print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True) shutil.rmtree(temp_root_dir) atexit.register(cleanup) @@ -123,6 +128,13 @@ def zimit(args=None): cmd_args.append("--url") cmd_args.append(url) + user_agent_suffix = "+Zimit " + if zimit_args.adminEmail: + user_agent_suffix += zimit_args.adminEmail + + cmd_args.append("--userAgentSuffix") + cmd_args.append(user_agent_suffix) + cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) @@ -130,7 +142,8 @@ def zimit(args=None): print("") print("----------") - print(f"running browsertrix-crawler crawl: {cmd_line}") + print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}") + print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) subprocess.run(cmd_args, check=True) warc_files = temp_root_dir / "collections" / "capture" / "archive" @@ -140,7 +153,7 @@ def zimit(args=None): print("") print("----------") - print(f"Processing {num_files} WARC files to ZIM") + print(f"Processing {num_files} WARC files to ZIM", flush=True) return warc2zim(warc2zim_args) @@ -149,7 +162,7 @@ def check_url(url): try: resp = requests.head(url, stream=True, allow_redirects=True, timeout=10) except requests.exceptions.RequestException as exc: - print(f"failed to connect to {url}: {exc}") + print(f"failed to connect to {url}: {exc}", flush=True) raise SystemExit(1) actual_url = resp.url @@ -176,6 +189,7 @@ def get_node_cmd_line(args): "scope", "exclude", "scroll", + "mobileDevice", ]: value = getattr(args, arg) if value: @@ -191,7 +205,7 @@ def sigint_handler(*args): print("") print("SIGINT/SIGTERM received, 
stopping zimit") print("") - print("") + print("", flush=True) sys.exit(3) From 82f0fae9595c15a8d66552bea3442916455c4b59 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Nov 2020 20:27:43 +0000 Subject: [PATCH 2/6] update to warc2zim 1.3.2 fix ci test command --- .github/workflows/ci.yaml | 2 +- Dockerfile | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 13637b6..262fa03 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,4 +16,4 @@ jobs: run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice - name: run integration test suite - run: docker run -it -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./test/integration.py" + run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./test/integration.py" diff --git a/Dockerfile b/Dockerfile index ea89f56..5d7c8b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,8 +4,7 @@ RUN mkdir -p /output WORKDIR /app -#RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0' -RUN pip install git+https://github.com/openzim/warc2zim.git@replay-update +RUN pip install 'warc2zim>=1.3.2' 'requests>=2.24.0' ADD zimit.py /app/ From 5e4b3d80b361405c63d8902ec144c8bb5ec3376f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Nov 2020 20:30:15 +0000 Subject: [PATCH 3/6] ci: path fix --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 262fa03..8181207 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,4 +16,4 @@ jobs: run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file 
isago.zim --adminEmail test@example.com --mobileDevice - name: run integration test suite - run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./test/integration.py" + run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py" From 4723376ebc9051ddbae3f362bbcd67be3c2e158c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Nov 2020 20:33:36 +0000 Subject: [PATCH 4/6] ci: add --keep to keep warc files --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8181207..4fcf517 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,7 +13,7 @@ jobs: run: docker build -t openzim/zimit:dev . - name: run crawl - run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice + run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice --keep - name: run integration test suite run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output openzim/zimit:dev bash -c "pip install pytest; pytest -v ./integration.py" From a801a1eef6e5b31d30d93e6fc26515ff6867b85f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Nov 2020 20:50:03 +0000 Subject: [PATCH 5/6] ci: improve tests, validate all UA, and check for at least one found --- test/integration.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/integration.py b/test/integration.py index e73dbbc..f890ca0 100644 --- a/test/integration.py +++ b/test/integration.py @@ -21,6 +21,7 @@ def test_zim_main_page(): def test_user_agent(): """ Test that mobile 
user agent was used in WARC request records with custom Zimit and email suffix""" + found = False #result = get_zim_article("/output/isago.zim", "H/isago.ml/") #print(result) for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"): @@ -32,7 +33,7 @@ def test_user_agent(): if ua: assert "iPhone" in ua assert ua.endswith(" +Zimit test@example.com") - return + found = True - # not found - assert False + # should find at least one + assert found From c0bb0503b8be55c0fd10e60535034b58101f8643 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Nov 2020 22:01:36 +0000 Subject: [PATCH 6/6] add support for --useSitemap flag to load additional URLs, potentially fixing #34! reformat --- test/integration.py | 16 ++++++++++------ zimit.py | 11 ++++++++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/test/integration.py b/test/integration.py index f890ca0..cbc9ecc 100644 --- a/test/integration.py +++ b/test/integration.py @@ -4,26 +4,30 @@ import glob import libzim.reader from warcio import ArchiveIterator + def get_zim_article(zimfile, path): zim_fh = libzim.reader.File(zimfile) return zim_fh.get_article(path).content.tobytes() + def test_is_file(): """ Ensure ZIM file exists""" assert os.path.isfile("/output/isago.zim") -def test_zim_main_page(): - """ Main page specified, http://isago.ml/, was a redirect to https - Ensure main page is the redirected page""" - assert b'"https://isago.ml/"' in get_zim_article("/output/isago.zim", "A/index.html") +def test_zim_main_page(): + """Main page specified, http://isago.ml/, was a redirect to https + Ensure main page is the redirected page""" + + assert b'"https://isago.ml/"' in get_zim_article( + "/output/isago.zim", "A/index.html" + ) + def test_user_agent(): """ Test that mobile user agent was used in WARC request records with custom Zimit and email suffix""" found = False - #result = get_zim_article("/output/isago.zim", "H/isago.ml/") - #print(result) for warc in 
glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"): with open(warc, "rb") as fh: for record in ArchiveIterator(fh): diff --git a/zimit.py b/zimit.py index 771f77f..fe06355 100755 --- a/zimit.py +++ b/zimit.py @@ -89,6 +89,11 @@ def zimit(args=None): "--mobileDevice", help="Crawl as Mobile Device", nargs="?", const="iPhone X" ) + parser.add_argument( + "--useSitemap", + help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)", + ) + zimit_args, warc2zim_args = parser.parse_known_args(args) # pass url and output to warc2zim also @@ -115,6 +120,7 @@ def zimit(args=None): temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) if not zimit_args.keep: + def cleanup(): print("") print("----------") @@ -142,7 +148,9 @@ def zimit(args=None): print("") print("----------") - print(f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}") + print( + f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}" + ) print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True) subprocess.run(cmd_args, check=True) @@ -190,6 +198,7 @@ def get_node_cmd_line(args): "exclude", "scroll", "mobileDevice", + "useSitemap", ]: value = getattr(args, arg) if value: