Merge pull request #373 from openzim/stream_dl

Stream files downloads to not exhaust memory
This commit is contained in:
benoit74 2024-08-12 22:23:17 +02:00 committed by GitHub
commit d814c23178
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 37 additions and 16 deletions

View File

@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add support for uncompressed tar archive in --warcs (#369) - Add support for uncompressed tar archive in --warcs (#369)
### Fixed
- Stream file downloads to avoid exhausting memory (#373)
## [2.1.0] - 2024-08-09 ## [2.1.0] - 2024-08-09
### Added ### Added

10
src/zimit/constants.py Normal file
View File

@ -0,0 +1,10 @@
import logging
from zimscraperlib.logging import getLogger
# Process exit codes shared across the zimit package.
# NOTE(review): the exact conditions that produce each code are not visible
# in this module; the names suggest a failed warc2zim check, a crawler limit
# being hit, and a "normal" warc2zim exit respectively — confirm in callers.
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
# Passed as the `timeout=` argument of `requests.get` by
# `zimit.utils.download_file` (requests interprets this value in seconds).
REQUESTS_TIMEOUT = 10
# Package-wide logger named "zimit", created at INFO level.
logger = getLogger(name="zimit", level=logging.INFO)

14
src/zimit/utils.py Normal file
View File

@ -0,0 +1,14 @@
from pathlib import Path
import requests
from zimit.constants import REQUESTS_TIMEOUT
def download_file(url: str, fpath: Path):
    """Fetch ``url`` and store the response body at ``fpath``, streaming it.

    The request is made with ``stream=True`` and the body is written to disk
    chunk by chunk (requested chunk size: 8192 bytes), so the payload is not
    loaded into memory as a whole. The HTTP status is checked with
    ``raise_for_status`` before the destination file is opened.
    """
    response = requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True)
    # Using the response as a context manager releases the connection on exit.
    with response:
        response.raise_for_status()
        with open(fpath, "wb") as target:
            # writelines() consumes the chunk iterator lazily, one write each.
            target.writelines(response.iter_content(chunk_size=8192))

View File

@ -6,7 +6,6 @@ and then calls the Node based driver
import atexit import atexit
import json import json
import logging
import re import re
import shutil import shutil
import signal import signal
@ -21,19 +20,17 @@ from pathlib import Path
import inotify import inotify
import inotify.adapters import inotify.adapters
import requests
from warc2zim.main import main as warc2zim from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri from zimscraperlib.uri import rebuild_uri
from zimit.__about__ import __version__ from zimit.__about__ import __version__
from zimit.constants import (
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 EXIT_CODE_CRAWLER_LIMIT_HIT,
EXIT_CODE_CRAWLER_LIMIT_HIT = 11 EXIT_CODE_WARC2ZIM_CHECK_FAILED,
NORMAL_WARC2ZIM_EXIT_CODE = 100 NORMAL_WARC2ZIM_EXIT_CODE,
REQUESTS_TIMEOUT = 10 logger,
)
logger = getLogger(name="zimit", level=logging.INFO) from zimit.utils import download_file
class ProgressFileWatcher: class ProgressFileWatcher:
@ -457,9 +454,7 @@ def run(raw_args):
f"Downloading browser profile from {custom_behavior} " f"Downloading browser profile from {custom_behavior} "
f"to {behaviors_file.name}" f"to {behaviors_file.name}"
) )
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT) download_file(custom_behavior, Path(behaviors_file.name))
resp.raise_for_status()
Path(behaviors_file.name).write_bytes(resp.content)
else: else:
logger.info( logger.info(
f"Copying browser profile from {custom_behavior} " f"Copying browser profile from {custom_behavior} "
@ -552,9 +547,7 @@ def run(raw_args):
# collisions # collisions
warc_file = Path(filename.name) warc_file = Path(filename.name)
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}") logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT) download_file(warc_location, warc_file)
resp.raise_for_status()
warc_file.write_bytes(resp.content)
# if it is a plain warc or warc.gz, simply add it to the list # if it is a plain warc or warc.gz, simply add it to the list
if suffix in {".warc", ".warc.gz"}: if suffix in {".warc", ".warc.gz"}: