Merge pull request #373 from openzim/stream_dl

Stream file downloads to avoid exhausting memory
This commit is contained in:
benoit74 2024-08-12 22:23:17 +02:00 committed by GitHub
commit d814c23178
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 37 additions and 16 deletions

View File

@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add support for uncompressed tar archive in --warcs (#369)
### Fixed
- Stream file downloads to avoid exhausting memory (#373)
## [2.1.0] - 2024-08-09
### Added

10
src/zimit/constants.py Normal file
View File

@ -0,0 +1,10 @@
import logging
from zimscraperlib.logging import getLogger
# Exit-code constants. Going by their names they cover a failed warc2zim check,
# the crawler hitting a limit, and warc2zim's normal exit, respectively.
# NOTE(review): the code consuming these values is not visible here; confirm
# the exact semantics against the callers.
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
# Passed as `timeout=` to requests.get() by zimit.utils.download_file; requests
# interprets this value in seconds.
REQUESTS_TIMEOUT = 10
# Logger named "zimit", created at INFO level, importable by other modules.
logger = getLogger(name="zimit", level=logging.INFO)

14
src/zimit/utils.py Normal file
View File

@ -0,0 +1,14 @@
from pathlib import Path
import requests
from zimit.constants import REQUESTS_TIMEOUT
def download_file(url: str, fpath: Path):
    """Fetch `url` and store the response body at `fpath`, streaming it.

    The request is made with `stream=True` and the body is written to disk
    in chunks of up to 8192 bytes, so the whole payload is never held in
    memory at once.

    Args:
        url: location to download from.
        fpath: destination file; it is opened in binary write mode, so any
            existing content is replaced.

    Raises:
        requests.HTTPError: via `raise_for_status()` when the server answers
            with an error status; this happens before `fpath` is opened.
    """
    with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as response:
        # Check the HTTP status first so a failed request never touches fpath.
        response.raise_for_status()
        with open(fpath, "wb") as target:
            # writelines() consumes the chunk iterator lazily, writing each
            # chunk as it arrives.
            target.writelines(response.iter_content(chunk_size=8192))

View File

@ -6,7 +6,6 @@ and then calls the Node based driver
import atexit
import json
import logging
import re
import shutil
import signal
@ -21,19 +20,17 @@ from pathlib import Path
import inotify
import inotify.adapters
import requests
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
from zimit.__about__ import __version__
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10
logger = getLogger(name="zimit", level=logging.INFO)
from zimit.constants import (
EXIT_CODE_CRAWLER_LIMIT_HIT,
EXIT_CODE_WARC2ZIM_CHECK_FAILED,
NORMAL_WARC2ZIM_EXIT_CODE,
logger,
)
from zimit.utils import download_file
class ProgressFileWatcher:
@ -457,9 +454,7 @@ def run(raw_args):
f"Downloading browser profile from {custom_behavior} "
f"to {behaviors_file.name}"
)
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
Path(behaviors_file.name).write_bytes(resp.content)
download_file(custom_behavior, Path(behaviors_file.name))
else:
logger.info(
f"Copying browser profile from {custom_behavior} "
@ -552,9 +547,7 @@ def run(raw_args):
# collisions
warc_file = Path(filename.name)
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
resp.raise_for_status()
warc_file.write_bytes(resp.content)
download_file(warc_location, warc_file)
# if it is a plain warc or warc.gz, simply add it to the list
if suffix in {".warc", ".warc.gz"}: