mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 03:12:04 -04:00
Merge pull request #373 from openzim/stream_dl
Stream files downloads to not exhaust memory
This commit is contained in:
commit
d814c23178
@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
- Add support for uncompressed tar archives in --warcs (#369)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Stream file downloads to avoid exhausting memory (#373)
|
||||
|
||||
## [2.1.0] - 2024-08-09
|
||||
|
||||
### Added
|
||||
|
10
src/zimit/constants.py
Normal file
10
src/zimit/constants.py
Normal file
@ -0,0 +1,10 @@
|
||||
import logging
|
||||
|
||||
from zimscraperlib.logging import getLogger
|
||||
|
||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||
# Value handed to ``requests.get(..., timeout=...)`` when downloading files
# (requests interprets this number as seconds).
REQUESTS_TIMEOUT = 10
|
||||
|
||||
# Logger named "zimit", created at INFO level; other zimit modules import it
# from here instead of building their own.
logger = getLogger(name="zimit", level=logging.INFO)
|
14
src/zimit/utils.py
Normal file
14
src/zimit/utils.py
Normal file
@ -0,0 +1,14 @@
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from zimit.constants import REQUESTS_TIMEOUT
|
||||
|
||||
|
||||
def download_file(url: str, fpath: Path):
    """Stream the content found at ``url`` into the file ``fpath``.

    The response body is requested with ``stream=True`` and written out in
    chunks of up to 8192 bytes, so the payload is not loaded into memory as a
    single block.

    Args:
        url: address passed to ``requests.get``.
        fpath: destination file, opened in ``"wb"`` mode once the response
            status has been checked.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    with requests.get(url, stream=True, timeout=REQUESTS_TIMEOUT) as response:
        # Check the status before opening the destination, so an HTTP error
        # never creates or truncates ``fpath``.
        response.raise_for_status()
        with open(fpath, "wb") as sink:
            # writelines() consumes the chunk iterator lazily, one write per
            # chunk, exactly like an explicit for-loop would.
            sink.writelines(response.iter_content(chunk_size=8192))
|
@ -6,7 +6,6 @@ and then calls the Node based driver
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import signal
|
||||
@ -21,19 +20,17 @@ from pathlib import Path
|
||||
|
||||
import inotify
|
||||
import inotify.adapters
|
||||
import requests
|
||||
from warc2zim.main import main as warc2zim
|
||||
from zimscraperlib.logging import getLogger
|
||||
from zimscraperlib.uri import rebuild_uri
|
||||
|
||||
from zimit.__about__ import __version__
|
||||
|
||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||
REQUESTS_TIMEOUT = 10
|
||||
|
||||
logger = getLogger(name="zimit", level=logging.INFO)
|
||||
from zimit.constants import (
|
||||
EXIT_CODE_CRAWLER_LIMIT_HIT,
|
||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED,
|
||||
NORMAL_WARC2ZIM_EXIT_CODE,
|
||||
logger,
|
||||
)
|
||||
from zimit.utils import download_file
|
||||
|
||||
|
||||
class ProgressFileWatcher:
|
||||
@ -457,9 +454,7 @@ def run(raw_args):
|
||||
f"Downloading browser profile from {custom_behavior} "
|
||||
f"to {behaviors_file.name}"
|
||||
)
|
||||
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
|
||||
resp.raise_for_status()
|
||||
Path(behaviors_file.name).write_bytes(resp.content)
|
||||
download_file(custom_behavior, Path(behaviors_file.name))
|
||||
else:
|
||||
logger.info(
|
||||
f"Copying browser profile from {custom_behavior} "
|
||||
@ -552,9 +547,7 @@ def run(raw_args):
|
||||
# collisions
|
||||
warc_file = Path(filename.name)
|
||||
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
|
||||
resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
|
||||
resp.raise_for_status()
|
||||
warc_file.write_bytes(resp.content)
|
||||
download_file(warc_location, warc_file)
|
||||
|
||||
# if it is a plain warc or warc.gz, simply add it to the list
|
||||
if suffix in {".warc", ".warc.gz"}:
|
||||
|
Loading…
x
Reference in New Issue
Block a user