mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 19:38:32 -04:00
commit
7cb118eaeb
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
@ -17,4 +17,4 @@ jobs:
|
|||||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||||
|
|
||||||
- name: run integration test suite
|
- name: run integration test suite
|
||||||
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
|
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
|
||||||
|
@ -12,14 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- `--title` to set ZIM title
|
- `--title` to set ZIM title
|
||||||
- `--description` to set ZIM description
|
- `--description` to set ZIM description
|
||||||
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
|
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
|
||||||
|
- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
- Using browsertrix-crawler 0.10.0-beta.4
|
- Using browsertrix-crawler 0.10.2
|
||||||
- Default and accepted values for `--waitUntil` from crawler's update
|
- Default and accepted values for `--waitUntil` from crawler's update
|
||||||
- Using `main` warc2zim ⚠️ change before releasing!
|
- Using `main` warc2zim ⚠️ change before releasing!
|
||||||
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
|
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
|
||||||
- `--failOnFailedSeed` used unconditionally
|
- `--failOnFailedSeed` used unconditionally
|
||||||
|
- `--lang` now passed to crawler (ISO-639-1)
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
|
|
||||||
|
44
Dockerfile
44
Dockerfile
@ -1,33 +1,35 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:0.10.0-beta.4
|
FROM webrecorder/browsertrix-crawler:0.10.2
|
||||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
|
RUN apt-get update \
|
||||||
|
&& apt-get install -qqy --no-install-recommends \
|
||||||
# temp (needs warc2zim release on zimit release)
|
libmagic1 \
|
||||||
RUN pip3 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.5.1' && \
|
python3.10-venv \
|
||||||
pip3 uninstall -y warc2zim && \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
pip3 install 'git+https://github.com/openzim/warc2zim@main#egg_name=warc2zim'
|
# python setup (in venv not to conflict with browsertrix)
|
||||||
|
&& python3 -m venv /app/zimit \
|
||||||
RUN mkdir -p /output
|
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' 'warc2zim==1.5.2' \
|
||||||
|
# placeholder (default output location)
|
||||||
WORKDIR /app
|
&& mkdir -p /output \
|
||||||
|
# disable chrome upgrade
|
||||||
# download list of bad domains to filter-out. intentionally run post-install
|
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
|
||||||
# so it's not cached in earlier layers (url stays same but content updated)
|
# download list of bad domains to filter-out. intentionally run post-install \
|
||||||
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
|
# so it's not cached in earlier layers (url stays same but content updated) \
|
||||||
|
mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||||
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
|
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
|
||||||
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
|
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
|
||||||
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
|
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
|
||||||
cat ./*.txt > /etc/blocklist.txt \
|
cat ./*.txt > /etc/blocklist.txt \
|
||||||
&& rm ./*.txt
|
&& rm ./*.txt \
|
||||||
RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
|
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
|
||||||
chmod +x /usr/local/bin/entrypoint.sh
|
chmod +x /usr/local/bin/entrypoint.sh
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
ADD zimit.py /app/
|
ADD zimit.py /app/
|
||||||
|
# fix shebang on zimit to use in-venv python
|
||||||
RUN ln -s /app/zimit.py /usr/bin/zimit
|
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
|
||||||
|
&& ln -s /app/zimit.py /usr/bin/zimit \
|
||||||
RUN printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome
|
&& chmod +x /usr/bin/zimit
|
||||||
|
|
||||||
ENTRYPOINT ["entrypoint.sh"]
|
ENTRYPOINT ["entrypoint.sh"]
|
||||||
CMD ["zimit"]
|
CMD ["zimit"]
|
||||||
|
17
zimit.py
17
zimit.py
@ -205,6 +205,18 @@ def zimit(args=None):
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--lang",
|
||||||
|
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--zim-lang",
|
||||||
|
help="Language metadata of ZIM "
|
||||||
|
"(warc2zim --lang param). ISO-639-3 code. "
|
||||||
|
"Retrieved from homepage if found, fallback to `eng`",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--mobileDevice",
|
"--mobileDevice",
|
||||||
help="Emulate mobile device by name from "
|
help="Emulate mobile device by name from "
|
||||||
@ -348,6 +360,10 @@ def zimit(args=None):
|
|||||||
warc2zim_args.append("--description")
|
warc2zim_args.append("--description")
|
||||||
warc2zim_args.append(zimit_args.description)
|
warc2zim_args.append(zimit_args.description)
|
||||||
|
|
||||||
|
if zimit_args.zim_lang:
|
||||||
|
warc2zim_args.append("--lang")
|
||||||
|
warc2zim_args.append(zimit_args.zim_lang)
|
||||||
|
|
||||||
print("----------")
|
print("----------")
|
||||||
print("Testing warc2zim args")
|
print("Testing warc2zim args")
|
||||||
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
||||||
@ -482,6 +498,7 @@ def get_node_cmd_line(args):
|
|||||||
"exclude",
|
"exclude",
|
||||||
"collection",
|
"collection",
|
||||||
"allowHashUrls",
|
"allowHashUrls",
|
||||||
|
"lang",
|
||||||
"mobileDevice",
|
"mobileDevice",
|
||||||
"userAgent",
|
"userAgent",
|
||||||
"useSitemap",
|
"useSitemap",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user