From 3070fe9724c48967438c060c0253b449d9f8776b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 13:16:00 +0000 Subject: [PATCH 1/3] Rollback previous changes around the presence of a default user-agent - Remove default userAgent value - Set a default mobileDevice - Add back comments explaining that userAgent overrides other settings - Add back logic around the computation of the userAgentSuffix instead of the userAgent - Add new noMobileDevice argument to not set the default mobileDevice --- src/zimit/zimit.py | 35 ++++++++++++++++++++--------------- tests/test_dummy.py | 6 +++--- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index dbf3965..a1d45d8 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -26,11 +26,6 @@ from zimscraperlib.uri import rebuild_uri from zimit.__about__ import __version__ -DEFAULT_USER_AGENT = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " - "(KHTML, like Gecko) Version/17.0 Safari/605.1.15" -) - EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 EXIT_CODE_CRAWLER_LIMIT_HIT = 11 NORMAL_WARC2ZIM_EXIT_CODE = 100 @@ -230,13 +225,21 @@ def run(raw_args): help="Emulate mobile device by name from " "https://github.com/puppeteer/puppeteer/blob/" "main/packages/puppeteer-core/src/common/Device.ts", + default="Pixel 2", + ) + + parser.add_argument( + "--noMobileDevice", + help="Do not emulate a mobile device (use at your own risk, behavior is" + "uncertain)", + action="store_true", + default=False, ) parser.add_argument( "--userAgent", - help="Override default user-agent with specified value ; --userAgentSuffix is " - "still applied", - default=DEFAULT_USER_AGENT, + help="Override default user-agent with specified value ; --userAgentSuffix and " + "--adminEmail have no effect when this is set", ) parser.add_argument( @@ -384,11 +387,9 @@ def run(raw_args): url = zimit_args.url - user_agent = zimit_args.userAgent - if zimit_args.userAgentSuffix: - user_agent 
+= f" {zimit_args.userAgentSuffix}" + user_agent_suffix = zimit_args.userAgentSuffix if zimit_args.adminEmail: - user_agent += f" {zimit_args.adminEmail}" + user_agent_suffix += f" {zimit_args.adminEmail}" if url: url = get_cleaned_url(url) @@ -443,8 +444,12 @@ def run(raw_args): cmd_args.append("--url") cmd_args.append(url) - cmd_args.append("--userAgent") - cmd_args.append(user_agent) + cmd_args.append("--userAgentSuffix") + cmd_args.append(user_agent_suffix) + + if not zimit_args.noMobileDevice: + cmd_args.append("--mobileDevice") + cmd_args.append(zimit_args.mobileDevice) cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) @@ -538,7 +543,7 @@ def get_node_cmd_line(args): "collection", "allowHashUrls", "lang", - "mobileDevice", + "userAgent", "useSitemap", "behaviors", "behaviorTimeout", diff --git a/tests/test_dummy.py b/tests/test_dummy.py index dd89067..54af094 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,6 +1,6 @@ -from zimit.zimit import DEFAULT_USER_AGENT +from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE # dummy test, just to have coverage report done -def test_default_user_agent(): - assert DEFAULT_USER_AGENT +def test_something_exists(): + assert NORMAL_WARC2ZIM_EXIT_CODE From e24479945f79113694cb76e14a62e4ddf2c2020e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 13:18:04 +0000 Subject: [PATCH 2/3] Remove trailing characters when retrieving Browsertrix Crawler version --- CHANGELOG.md | 7 +++++++ src/zimit/zimit.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac640f3..a4d077b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `--version` flag to display Zimit version - New `--logging` flag to adjust Browsertrix Crawler logging (#273) - Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275) +- New `--noMobileDevice` CLI 
argument ### Changed @@ -21,6 +22,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Adopt Python bootstrap conventions - Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim - Upgrade to Python 3.12 + upgrade dependencies +- `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values +- `--userAgent` CLI argument is not mandatory anymore + +### Fixed + +- Fix support for Youtube videos (#291) ## [1.6.3] - 2024-01-18 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index a1d45d8..7c2764a 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -370,7 +370,7 @@ def run(raw_args): except Exception: logger.error("Failed to get Browsertrix crawler version") raise - crawler_version = crawl.stdout + crawler_version = crawl.stdout.strip() logger.info(f"Browsertrix crawler: version {crawler_version}") # pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler From 728784d6bf143735cf03d05ee2fa52f838404cf3 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Mar 2024 13:22:59 +0000 Subject: [PATCH 3/3] Upgrade Browsertrix Crawler to 1.0.3 --- CHANGELOG.md | 1 + Dockerfile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4d077b..81dc91d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Upgrade to Python 3.12 + upgrade dependencies - `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values - `--userAgent` CLI argument is not mandatory anymore +- Upgraded Browsertrix Crawler to 1.0.3 ### Fixed diff --git a/Dockerfile b/Dockerfile index 53306f3..c1731d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.0.0-beta.6 +FROM webrecorder/browsertrix-crawler:1.0.3 LABEL org.opencontainers.image.source 
https://github.com/openzim/zimit # add deadsnakes ppa for Python 3.12 on Ubuntu Jammy