mirror of
https://github.com/openzim/zimit.git
synced 2025-09-24 04:30:11 -04:00
Merge pull request #292 from openzim/ua_not_mandatory
Change crawler default settings around userAgent and mobileDevice
This commit is contained in:
commit
f637c3fccc
@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- New `--version` flag to display Zimit version
|
- New `--version` flag to display Zimit version
|
||||||
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
|
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
|
||||||
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
|
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
|
||||||
|
- New `--noMobileDevice` CLI argument
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
@ -21,6 +22,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- Adopt Python bootstrap conventions
|
- Adopt Python bootstrap conventions
|
||||||
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
|
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
|
||||||
- Upgrade to Python 3.12 + upgrade dependencies
|
- Upgrade to Python 3.12 + upgrade dependencies
|
||||||
|
- `--userAgent` CLI argument once again overrides the `--userAgentSuffix` and `--adminEmail` values
|
||||||
|
- `--userAgent` CLI argument is not mandatory anymore
|
||||||
|
- Upgraded Browsertrix Crawler to 1.0.3
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fix support for YouTube videos (#291)
|
||||||
|
|
||||||
## [1.6.3] - 2024-01-18
|
## [1.6.3] - 2024-01-18
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
|
FROM webrecorder/browsertrix-crawler:1.0.3
|
||||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||||
|
|
||||||
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
|
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
|
||||||
|
@ -26,11 +26,6 @@ from zimscraperlib.uri import rebuild_uri
|
|||||||
|
|
||||||
from zimit.__about__ import __version__
|
from zimit.__about__ import __version__
|
||||||
|
|
||||||
DEFAULT_USER_AGENT = (
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
|
|
||||||
"(KHTML, like Gecko) Version/17.0 Safari/605.1.15"
|
|
||||||
)
|
|
||||||
|
|
||||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||||
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||||
@ -230,13 +225,21 @@ def run(raw_args):
|
|||||||
help="Emulate mobile device by name from "
|
help="Emulate mobile device by name from "
|
||||||
"https://github.com/puppeteer/puppeteer/blob/"
|
"https://github.com/puppeteer/puppeteer/blob/"
|
||||||
"main/packages/puppeteer-core/src/common/Device.ts",
|
"main/packages/puppeteer-core/src/common/Device.ts",
|
||||||
|
default="Pixel 2",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--noMobileDevice",
|
||||||
|
help="Do not emulate a mobile device (use at your own risk, behavior is"
|
||||||
|
"uncertain)",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--userAgent",
|
"--userAgent",
|
||||||
help="Override default user-agent with specified value ; --userAgentSuffix is "
|
help="Override default user-agent with specified value ; --userAgentSuffix and "
|
||||||
"still applied",
|
"--adminEmail have no effect when this is set",
|
||||||
default=DEFAULT_USER_AGENT,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -367,7 +370,7 @@ def run(raw_args):
|
|||||||
except Exception:
|
except Exception:
|
||||||
logger.error("Failed to get Browsertrix crawler version")
|
logger.error("Failed to get Browsertrix crawler version")
|
||||||
raise
|
raise
|
||||||
crawler_version = crawl.stdout
|
crawler_version = crawl.stdout.strip()
|
||||||
logger.info(f"Browsertrix crawler: version {crawler_version}")
|
logger.info(f"Browsertrix crawler: version {crawler_version}")
|
||||||
|
|
||||||
# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
|
# pass a scraper suffix to warc2zim so that both zimit, warc2zim and crawler
|
||||||
@ -384,11 +387,9 @@ def run(raw_args):
|
|||||||
|
|
||||||
url = zimit_args.url
|
url = zimit_args.url
|
||||||
|
|
||||||
user_agent = zimit_args.userAgent
|
user_agent_suffix = zimit_args.userAgentSuffix
|
||||||
if zimit_args.userAgentSuffix:
|
|
||||||
user_agent += f" {zimit_args.userAgentSuffix}"
|
|
||||||
if zimit_args.adminEmail:
|
if zimit_args.adminEmail:
|
||||||
user_agent += f" {zimit_args.adminEmail}"
|
user_agent_suffix += f" {zimit_args.adminEmail}"
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
url = get_cleaned_url(url)
|
url = get_cleaned_url(url)
|
||||||
@ -443,8 +444,12 @@ def run(raw_args):
|
|||||||
cmd_args.append("--url")
|
cmd_args.append("--url")
|
||||||
cmd_args.append(url)
|
cmd_args.append(url)
|
||||||
|
|
||||||
cmd_args.append("--userAgent")
|
cmd_args.append("--userAgentSuffix")
|
||||||
cmd_args.append(user_agent)
|
cmd_args.append(user_agent_suffix)
|
||||||
|
|
||||||
|
if not zimit_args.noMobileDevice:
|
||||||
|
cmd_args.append("--mobileDevice")
|
||||||
|
cmd_args.append(zimit_args.mobileDevice)
|
||||||
|
|
||||||
cmd_args.append("--cwd")
|
cmd_args.append("--cwd")
|
||||||
cmd_args.append(str(temp_root_dir))
|
cmd_args.append(str(temp_root_dir))
|
||||||
@ -538,7 +543,7 @@ def get_node_cmd_line(args):
|
|||||||
"collection",
|
"collection",
|
||||||
"allowHashUrls",
|
"allowHashUrls",
|
||||||
"lang",
|
"lang",
|
||||||
"mobileDevice",
|
"userAgent",
|
||||||
"useSitemap",
|
"useSitemap",
|
||||||
"behaviors",
|
"behaviors",
|
||||||
"behaviorTimeout",
|
"behaviorTimeout",
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from zimit.zimit import DEFAULT_USER_AGENT
|
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
|
||||||
|
|
||||||
|
|
||||||
# dummy test, just to have coverage report done
|
# dummy test, just to have coverage report done
|
||||||
def test_default_user_agent():
|
def test_something_exists():
|
||||||
assert DEFAULT_USER_AGENT
|
assert NORMAL_WARC2ZIM_EXIT_CODE
|
||||||
|
Loading…
x
Reference in New Issue
Block a user