From 2a317c91e4e24476626e4be11f8b80e0b81d7e6f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 23 Oct 2023 11:45:55 +0200 Subject: [PATCH] User-Agent has a default and is used for check_url --- CHANGELOG.md | 3 +++ zimit.py | 36 +++++++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec63fdc..2ddcdda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Scraper fails for all HTTP error codes returned when checking URL at startup (#223) +- User-Agent now has a default value (#228) +- Manipulation of spaces with UA suffix and adminEmail has been modified +- Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227) ## [1.5.3] - 2023-10-02 diff --git a/zimit.py b/zimit.py index 943f7c0..baed799 100755 --- a/zimit.py +++ b/zimit.py @@ -28,6 +28,7 @@ from tld import get_fld from warc2zim.main import warc2zim from zimscraperlib.uri import rebuild_uri +DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15" class ProgressFileWatcher: def __init__(self, output_dir, stats_path): @@ -226,14 +227,15 @@ def zimit(args=None): parser.add_argument( "--userAgent", - help="Override user-agent with specified", + help="Override default user-agent with specified value ; --userAgentSuffix is still applied", + default=DEFAULT_USER_AGENT ) parser.add_argument( "--userAgentSuffix", help="Append suffix to existing browser user-agent " "(ex: +MyCrawler, info@example.com)", - default="+Zimit ", + default="+Zimit", ) parser.add_argument( @@ -344,8 +346,14 @@ def zimit(args=None): url = zimit_args.url + user_agent = zimit_args.userAgent + if zimit_args.userAgentSuffix: + user_agent += f" {zimit_args.userAgentSuffix}" + if zimit_args.adminEmail: + user_agent += f" {zimit_args.adminEmail}" + if url: - url = check_url(url, zimit_args.scopeType) + url = check_url(url, user_agent, zimit_args.scopeType) warc2zim_args.append("--url") warc2zim_args.append(url) @@ -394,12 +402,19 @@ def zimit(args=None): cmd_args.append("--url") cmd_args.append(url) - user_agent_suffix = zimit_args.userAgentSuffix - if zimit_args.adminEmail: - user_agent_suffix += zimit_args.adminEmail + if zimit_args.mobileDevice: + if zimit_args.userAgent != DEFAULT_USER_AGENT: + print("WARNING: --mobileDevice and --userAgent are both set ; userAgent won't be used for browsertrix crawl; only userAgentSuffix and adminEmail will be passed") - cmd_args.append("--userAgentSuffix") - cmd_args.append(user_agent_suffix) + user_agent_suffix = zimit_args.userAgentSuffix + if zimit_args.adminEmail: + user_agent_suffix += f" {zimit_args.adminEmail}" + + cmd_args.append("--userAgentSuffix") + cmd_args.append(user_agent_suffix) + else: + cmd_args.append("--userAgent") + cmd_args.append(user_agent) cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) @@ -445,11 +460,11 @@ def zimit(args=None): return warc2zim(warc2zim_args) -def check_url(url, scope=None): +def check_url(url, user_agent, scope=None): url = urllib.parse.urlparse(url) try: resp = requests.head( - url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27) + url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27), headers={"User-Agent": user_agent} ) resp.raise_for_status() except requests.exceptions.RequestException as exc: @@ -505,7 +520,6 @@ def get_node_cmd_line(args): "allowHashUrls", "lang", "mobileDevice", - "userAgent", "useSitemap", "behaviors", "behaviorTimeout",