mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
Merge pull request #373 from openzim/stream_dl
Stream file downloads to avoid exhausting memory
This commit is contained in:
commit
d814c23178
@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
- Add support for uncompressed tar archive in --warcs (#369)
|
- Add support for uncompressed tar archive in --warcs (#369)
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Stream file downloads to avoid exhausting memory (#373)
|
||||||
|
|
||||||
## [2.1.0] - 2024-08-09
|
## [2.1.0] - 2024-08-09
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
10
src/zimit/constants.py
Normal file
10
src/zimit/constants.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# Shared constants and the package-wide logger for zimit.
import logging
|
||||||
|
|
||||||
|
from zimscraperlib.logging import getLogger
|
||||||
|
|
||||||
|
# Exit codes. The names suggest: warc2zim check failed, crawler limit hit, and
# normal warc2zim completion.
# NOTE(review): the code paths using these values are not visible here;
# confirm the exact semantics against the callers.
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||||
|
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||||
|
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||||
|
# Value passed as `timeout=` to requests.get() when downloading files
# (requests interprets this value in seconds).
REQUESTS_TIMEOUT = 10
|
||||||
|
|
||||||
|
# Logger named "zimit" at INFO level; other zimit modules import it from here.
logger = getLogger(name="zimit", level=logging.INFO)
|
14
src/zimit/utils.py
Normal file
14
src/zimit/utils.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from zimit.constants import REQUESTS_TIMEOUT
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url: str, fpath: Path):
    """Fetch ``url`` and store the response body at ``fpath``.

    The request is made in streaming mode and the body is written to disk
    in 8192-byte pieces, so the payload is never held in memory as a whole.

    ``raise_for_status()`` is called before the destination is opened, so an
    HTTP error status raises without creating or truncating ``fpath``.
    """
    chunk_size = 8192
    with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as response:
        response.raise_for_status()
        # Open the target only once the response status is known to be OK.
        with open(fpath, "wb") as target:
            target.writelines(response.iter_content(chunk_size=chunk_size))
|
@ -6,7 +6,6 @@ and then calls the Node based driver
|
|||||||
|
|
||||||
import atexit
|
import atexit
|
||||||
import json
|
import json
|
||||||
import logging
|
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import signal
|
import signal
|
||||||
@ -21,19 +20,17 @@ from pathlib import Path
|
|||||||
|
|
||||||
import inotify
|
import inotify
|
||||||
import inotify.adapters
|
import inotify.adapters
|
||||||
import requests
|
|
||||||
from warc2zim.main import main as warc2zim
|
from warc2zim.main import main as warc2zim
|
||||||
from zimscraperlib.logging import getLogger
|
|
||||||
from zimscraperlib.uri import rebuild_uri
|
from zimscraperlib.uri import rebuild_uri
|
||||||
|
|
||||||
from zimit.__about__ import __version__
|
from zimit.__about__ import __version__
|
||||||
|
from zimit.constants import (
|
||||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
EXIT_CODE_CRAWLER_LIMIT_HIT,
|
||||||
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
EXIT_CODE_WARC2ZIM_CHECK_FAILED,
|
||||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
NORMAL_WARC2ZIM_EXIT_CODE,
|
||||||
REQUESTS_TIMEOUT = 10
|
logger,
|
||||||
|
)
|
||||||
logger = getLogger(name="zimit", level=logging.INFO)
|
from zimit.utils import download_file
|
||||||
|
|
||||||
|
|
||||||
class ProgressFileWatcher:
|
class ProgressFileWatcher:
|
||||||
@ -457,9 +454,7 @@ def run(raw_args):
|
|||||||
f"Downloading browser profile from {custom_behavior} "
|
f"Downloading browser profile from {custom_behavior} "
|
||||||
f"to {behaviors_file.name}"
|
f"to {behaviors_file.name}"
|
||||||
)
|
)
|
||||||
resp = requests.get(custom_behavior, timeout=REQUESTS_TIMEOUT)
|
download_file(custom_behavior, Path(behaviors_file.name))
|
||||||
resp.raise_for_status()
|
|
||||||
Path(behaviors_file.name).write_bytes(resp.content)
|
|
||||||
else:
|
else:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Copying browser profile from {custom_behavior} "
|
f"Copying browser profile from {custom_behavior} "
|
||||||
@ -552,9 +547,7 @@ def run(raw_args):
|
|||||||
# collisions
|
# collisions
|
||||||
warc_file = Path(filename.name)
|
warc_file = Path(filename.name)
|
||||||
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
|
logger.info(f"Downloading WARC(s) from {warc_location} to {warc_file}")
|
||||||
resp = requests.get(warc_location, timeout=REQUESTS_TIMEOUT)
|
download_file(warc_location, warc_file)
|
||||||
resp.raise_for_status()
|
|
||||||
warc_file.write_bytes(resp.content)
|
|
||||||
|
|
||||||
# if it is a plain warc or warc.gz, simply add it to the list
|
# if it is a plain warc or warc.gz, simply add it to the list
|
||||||
if suffix in {".warc", ".warc.gz"}:
|
if suffix in {".warc", ".warc.gz"}:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user