Merge pull request #373 from openzim/stream_dl

Stream file downloads to avoid exhausting memory
2025-09-22 11:22:23 -04:00 · 2024-08-12 22:23:17 +02:00 · 2024-08-12 22:23:17 +02:00 · d814c23178
commit d814c23178
parent d0d0c6e6e6 efdf7804c0
4 changed files with 37 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add support for uncompressed tar archive in --warcs (#369)
 ### Fixed
 - Stream file downloads to avoid exhausting memory (#373)
 ## [2.1.0] - 2024-08-09
 ### Added
--- a/src/zimit/constants.py
+++ b/src/zimit/constants.py
@ -0,0 +1,10 @@
 import logging
 from zimscraperlib.logging import getLogger
 # Exit/return code values, imported by zimit.zimit from this module.
 # NOTE(review): the exact meaning of each code is inferred from its name
 # only -- confirm against the call sites in zimit.py.
 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
 EXIT_CODE_CRAWLER_LIMIT_HIT = 11
 NORMAL_WARC2ZIM_EXIT_CODE = 100
 # Passed as `timeout=` to `requests.get` in zimit.utils.download_file.
 REQUESTS_TIMEOUT = 10
 # Logger obtained from zimscraperlib for the name "zimit", at INFO level.
 logger = getLogger(name="zimit", level=logging.INFO)
--- a/src/zimit/utils.py
+++ b/src/zimit/utils.py
@ -0,0 +1,14 @@
 from pathlib import Path
 import requests
 from zimit.constants import REQUESTS_TIMEOUT
 def download_file(url: str, fpath: Path):
    """Save the body of an HTTP GET on `url` into `fpath`, chunk by chunk.

    The request is issued in streaming mode with REQUESTS_TIMEOUT as its
    timeout, and the body is written to disk from an iterator created with
    chunk_size=8192 instead of being read into memory as a whole.

    An HTTP error status raises (through `raise_for_status`) before `fpath`
    is opened for writing.
    """
    response = requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True)
    with response:
        # Fail first so that an error response never touches the target file.
        response.raise_for_status()
        with open(fpath, "wb") as target:
            # writelines() consumes the chunk iterator and writes each chunk
            # in turn.
            target.writelines(response.iter_content(chunk_size=8192))
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@ -6,7 +6,6 @@ and then calls the Node based driver
 import atexit
 import json
 import logging
 import re
 import shutil
 import signal
@ -21,19 +20,17 @@ from pathlib import Path
 import inotify
 import inotify.adapters
 import requests
 from warc2zim.main import main as warc2zim
 from zimscraperlib.logging import getLogger
 from zimscraperlib.uri import rebuild_uri
 from zimit.__about__ import __version__
-
+from zimit.constants import (
-EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
+    EXIT_CODE_CRAWLER_LIMIT_HIT,
-EXIT_CODE_CRAWLER_LIMIT_HIT = 11
+    EXIT_CODE_WARC2ZIM_CHECK_FAILED,
-NORMAL_WARC2ZIM_EXIT_CODE = 100
+    NORMAL_WARC2ZIM_EXIT_CODE,
-REQUESTS_TIMEOUT = 10
+    logger,
-
+)
-logger = getLogger(name="zimit", level=logging.INFO)
+from zimit.utils import download_file
 class ProgressFileWatcher:
@ -457,9 +454,7 @@ def run(raw_args):
                    f"Downloading browser profile from {custom_behavior} "
                    f"to {behaviors_file.name}"
                )
-                resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
+                download_file(custom_behavior, Path(behaviors_file.name))
                resp.raise_for_status()
                Path(behaviors_file.name).write_bytes(resp.content)
            else:
                logger.info(
                    f"Copying browser profile from {custom_behavior} "
@ -552,9 +547,7 @@ def run(raw_args):
            # collisions
            warc_file = Path(filename.name)
            logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
-            resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
+            download_file(warc_location, warc_file)
            resp.raise_for_status()
            warc_file.write_bytes(resp.content)
            # if it is a plain warc or warc.gz, simply add it to the list
            if suffix in {".warc", ".warc.gz"}: