mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 03:12:04 -04:00
Merge pull request #229 from openzim/user_agent
Revisit check-url behavior and provide User-Agent a custom default value
This commit is contained in:
commit
e0a4d3ffef
@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
- Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
|
- Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
|
||||||
|
- User-Agent now has a default value (#228)
|
||||||
|
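- Handling of spaces around the UA suffix and adminEmail has been modified: the default suffix no longer carries a trailing space, and the User-Agent, suffix and adminEmail are now joined with single spaces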
|
||||||
|
- Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)
|
||||||
|
|
||||||
## [1.5.3] - 2023-10-02
|
## [1.5.3] - 2023-10-02
|
||||||
|
|
||||||
|
@ -35,8 +35,11 @@ def test_user_agent():
|
|||||||
if record.rec_type == "request":
|
if record.rec_type == "request":
|
||||||
print(record.http_headers)
|
print(record.http_headers)
|
||||||
ua = record.http_headers.get_header("User-Agent")
|
ua = record.http_headers.get_header("User-Agent")
|
||||||
if ua:
|
# remove 'and ua != "undefined"' once
|
||||||
assert "Pixel" in ua
|
# https://github.com/webrecorder/browsertrix-crawler/pull/420 is
|
||||||
|
# released / used by us
|
||||||
|
if ua and ua != "undefined":
|
||||||
|
assert "Mozilla" in ua
|
||||||
assert ua.endswith(" +Zimit test@example.com")
|
assert ua.endswith(" +Zimit test@example.com")
|
||||||
found = True
|
found = True
|
||||||
|
|
||||||
|
33
zimit.py
33
zimit.py
@ -28,6 +28,7 @@ from tld import get_fld
|
|||||||
from warc2zim.main import warc2zim
|
from warc2zim.main import warc2zim
|
||||||
from zimscraperlib.uri import rebuild_uri
|
from zimscraperlib.uri import rebuild_uri
|
||||||
|
|
||||||
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
|
||||||
|
|
||||||
class ProgressFileWatcher:
|
class ProgressFileWatcher:
|
||||||
def __init__(self, output_dir, stats_path):
|
def __init__(self, output_dir, stats_path):
|
||||||
@ -226,14 +227,15 @@ def zimit(args=None):
|
|||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--userAgent",
|
"--userAgent",
|
||||||
help="Override user-agent with specified",
|
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
|
||||||
|
default=DEFAULT_USER_AGENT
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--userAgentSuffix",
|
"--userAgentSuffix",
|
||||||
help="Append suffix to existing browser user-agent "
|
help="Append suffix to existing browser user-agent "
|
||||||
"(ex: +MyCrawler, info@example.com)",
|
"(ex: +MyCrawler, info@example.com)",
|
||||||
default="+Zimit ",
|
default="+Zimit",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -344,8 +346,14 @@ def zimit(args=None):
|
|||||||
|
|
||||||
url = zimit_args.url
|
url = zimit_args.url
|
||||||
|
|
||||||
|
user_agent = zimit_args.userAgent
|
||||||
|
if zimit_args.userAgentSuffix:
|
||||||
|
user_agent += f" {zimit_args.userAgentSuffix}"
|
||||||
|
if zimit_args.adminEmail:
|
||||||
|
user_agent += f" {zimit_args.adminEmail}"
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
url = check_url(url, zimit_args.scopeType)
|
url = check_url(url, user_agent, zimit_args.scopeType)
|
||||||
warc2zim_args.append("--url")
|
warc2zim_args.append("--url")
|
||||||
warc2zim_args.append(url)
|
warc2zim_args.append(url)
|
||||||
|
|
||||||
@ -394,12 +402,8 @@ def zimit(args=None):
|
|||||||
cmd_args.append("--url")
|
cmd_args.append("--url")
|
||||||
cmd_args.append(url)
|
cmd_args.append(url)
|
||||||
|
|
||||||
user_agent_suffix = zimit_args.userAgentSuffix
|
cmd_args.append("--userAgent")
|
||||||
if zimit_args.adminEmail:
|
cmd_args.append(user_agent)
|
||||||
user_agent_suffix += zimit_args.adminEmail
|
|
||||||
|
|
||||||
cmd_args.append("--userAgentSuffix")
|
|
||||||
cmd_args.append(user_agent_suffix)
|
|
||||||
|
|
||||||
cmd_args.append("--cwd")
|
cmd_args.append("--cwd")
|
||||||
cmd_args.append(str(temp_root_dir))
|
cmd_args.append(str(temp_root_dir))
|
||||||
@ -445,13 +449,13 @@ def zimit(args=None):
|
|||||||
return warc2zim(warc2zim_args)
|
return warc2zim(warc2zim_args)
|
||||||
|
|
||||||
|
|
||||||
def check_url(url, scope=None):
|
def check_url(url, user_agent, scope=None):
|
||||||
url = urllib.parse.urlparse(url)
|
url = urllib.parse.urlparse(url)
|
||||||
try:
|
try:
|
||||||
resp = requests.head(
|
with requests.get(
|
||||||
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27)
|
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27), headers={"User-Agent": user_agent}
|
||||||
)
|
) as resp:
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
except requests.exceptions.RequestException as exc:
|
except requests.exceptions.RequestException as exc:
|
||||||
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
|
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
@ -505,7 +509,6 @@ def get_node_cmd_line(args):
|
|||||||
"allowHashUrls",
|
"allowHashUrls",
|
||||||
"lang",
|
"lang",
|
||||||
"mobileDevice",
|
"mobileDevice",
|
||||||
"userAgent",
|
|
||||||
"useSitemap",
|
"useSitemap",
|
||||||
"behaviors",
|
"behaviors",
|
||||||
"behaviorTimeout",
|
"behaviorTimeout",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user