User-Agent has a default and is used for check_url

This commit is contained in:
benoit74 2023-10-23 11:45:55 +02:00
parent f22bb9218c
commit 2a317c91e4
No known key found for this signature in database
GPG Key ID: B89606434FC7B530
2 changed files with 28 additions and 11 deletions

View File

@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ### Changed
- Scraper fails for all HTTP error codes returned when checking URL at startup (#223) - Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
- User-Agent now has a default value (#228)
- Spacing around the UA suffix and adminEmail has changed: each one is now appended to the User-Agent preceded by a single space
- Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)
## [1.5.3] - 2023-10-02 ## [1.5.3] - 2023-10-02

View File

@ -28,6 +28,7 @@ from tld import get_fld
from warc2zim.main import warc2zim from warc2zim.main import warc2zim
from zimscraperlib.uri import rebuild_uri from zimscraperlib.uri import rebuild_uri
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
class ProgressFileWatcher: class ProgressFileWatcher:
def __init__(self, output_dir, stats_path): def __init__(self, output_dir, stats_path):
@ -226,7 +227,8 @@ def zimit(args=None):
parser.add_argument( parser.add_argument(
"--userAgent", "--userAgent",
help="Override user-agent with specified", help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
default=DEFAULT_USER_AGENT
) )
parser.add_argument( parser.add_argument(
@ -344,8 +346,14 @@ def zimit(args=None):
url = zimit_args.url url = zimit_args.url
user_agent = zimit_args.userAgent
if zimit_args.userAgentSuffix:
user_agent += f" {zimit_args.userAgentSuffix}"
if zimit_args.adminEmail:
user_agent += f" {zimit_args.adminEmail}"
if url: if url:
url = check_url(url, zimit_args.scopeType) url = check_url(url, user_agent, zimit_args.scopeType)
warc2zim_args.append("--url") warc2zim_args.append("--url")
warc2zim_args.append(url) warc2zim_args.append(url)
@ -394,12 +402,19 @@ def zimit(args=None):
cmd_args.append("--url") cmd_args.append("--url")
cmd_args.append(url) cmd_args.append(url)
if zimit_args.mobileDevice:
if zimit_args.userAgent != DEFAULT_USER_AGENT:
print("WARNING: --mobileDevice and --userAgent are both set ; userAgent won't be used for browsertrix crawl; only userAgentSuffix and adminEmail will be passed")
user_agent_suffix = zimit_args.userAgentSuffix user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail: if zimit_args.adminEmail:
user_agent_suffix += zimit_args.adminEmail user_agent_suffix += f" {zimit_args.adminEmail}"
cmd_args.append("--userAgentSuffix") cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix) cmd_args.append(user_agent_suffix)
else:
cmd_args.append("--userAgent")
cmd_args.append(user_agent)
cmd_args.append("--cwd") cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir)) cmd_args.append(str(temp_root_dir))
@ -445,11 +460,11 @@ def zimit(args=None):
return warc2zim(warc2zim_args) return warc2zim(warc2zim_args)
def check_url(url, scope=None): def check_url(url, user_agent, scope=None):
url = urllib.parse.urlparse(url) url = urllib.parse.urlparse(url)
try: try:
resp = requests.head( resp = requests.head(
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27) url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27), headers={"User-Agent": user_agent}
) )
resp.raise_for_status() resp.raise_for_status()
except requests.exceptions.RequestException as exc: except requests.exceptions.RequestException as exc:
@ -505,7 +520,6 @@ def get_node_cmd_line(args):
"allowHashUrls", "allowHashUrls",
"lang", "lang",
"mobileDevice", "mobileDevice",
"userAgent",
"useSitemap", "useSitemap",
"behaviors", "behaviors",
"behaviorTimeout", "behaviorTimeout",