mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
commit
7cb118eaeb
2
.github/workflows/ci.yaml
vendored
2
.github/workflows/ci.yaml
vendored
@ -17,4 +17,4 @@ jobs:
|
||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
|
||||
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
|
||||
|
@ -12,14 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- `--title` to set ZIM title
|
||||
- `--description` to set ZIM description
|
||||
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
|
||||
- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
|
||||
|
||||
### Changed
|
||||
|
||||
- Using browsertrix-crawler 0.10.0-beta.4
|
||||
- Using browsertrix-crawler 0.10.2
|
||||
- Default and accepted values for `--waitUntil` from crawler's update
|
||||
- Using `main` warc2zim ⚠️ change before releasing!
|
||||
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
|
||||
- `--failOnFailedSeed` used unconditionally
|
||||
- `--lang` now passed to crawler (ISO-639-1)
|
||||
|
||||
### Removed
|
||||
|
||||
|
44
Dockerfile
44
Dockerfile
@ -1,33 +1,35 @@
|
||||
FROM webrecorder/browsertrix-crawler:0.10.0-beta.4
|
||||
FROM webrecorder/browsertrix-crawler:0.10.2
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||
|
||||
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# temp (needs warc2zim release on zimit release)
|
||||
RUN pip3 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.5.1' && \
|
||||
pip3 uninstall -y warc2zim && \
|
||||
pip3 install 'git+https://github.com/openzim/warc2zim@main#egg_name=warc2zim'
|
||||
|
||||
RUN mkdir -p /output
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# download list of bad domains to filter-out. intentionally run post-install
|
||||
# so it's not cached in earlier layers (url stays same but content updated)
|
||||
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||
RUN apt-get update \
|
||||
&& apt-get install -qqy --no-install-recommends \
|
||||
libmagic1 \
|
||||
python3.10-venv \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# python setup (in venv not to conflict with browsertrix)
|
||||
&& python3 -m venv /app/zimit \
|
||||
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' 'warc2zim==1.5.2' \
|
||||
# placeholder (default output location)
|
||||
&& mkdir -p /output \
|
||||
# disable chrome upgrade
|
||||
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
|
||||
# download list of bad domains to filter-out. intentionally run post-install \
|
||||
# so it's not cached in earlier layers (url stays same but content updated) \
|
||||
mkdir -p /tmp/ads && cd /tmp/ads && \
|
||||
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
|
||||
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
|
||||
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
|
||||
cat ./*.txt > /etc/blocklist.txt \
|
||||
&& rm ./*.txt
|
||||
RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
|
||||
&& rm ./*.txt \
|
||||
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
|
||||
chmod +x /usr/local/bin/entrypoint.sh
|
||||
|
||||
WORKDIR /app
|
||||
ADD zimit.py /app/
|
||||
|
||||
RUN ln -s /app/zimit.py /usr/bin/zimit
|
||||
|
||||
RUN printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome
|
||||
# fix shebang on zimit to use in-venv python
|
||||
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
|
||||
&& ln -s /app/zimit.py /usr/bin/zimit \
|
||||
&& chmod +x /usr/bin/zimit
|
||||
|
||||
ENTRYPOINT ["entrypoint.sh"]
|
||||
CMD ["zimit"]
|
||||
|
17
zimit.py
17
zimit.py
@ -205,6 +205,18 @@ def zimit(args=None):
|
||||
action="store_true",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--lang",
|
||||
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--zim-lang",
|
||||
help="Language metadata of ZIM "
|
||||
"(warc2zim --lang param). ISO-639-3 code. "
|
||||
"Retrieved from homepage if found, fallback to `eng`",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--mobileDevice",
|
||||
help="Emulate mobile device by name from "
|
||||
@ -348,6 +360,10 @@ def zimit(args=None):
|
||||
warc2zim_args.append("--description")
|
||||
warc2zim_args.append(zimit_args.description)
|
||||
|
||||
if zimit_args.zim_lang:
|
||||
warc2zim_args.append("--lang")
|
||||
warc2zim_args.append(zimit_args.zim_lang)
|
||||
|
||||
print("----------")
|
||||
print("Testing warc2zim args")
|
||||
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
|
||||
@ -482,6 +498,7 @@ def get_node_cmd_line(args):
|
||||
"exclude",
|
||||
"collection",
|
||||
"allowHashUrls",
|
||||
"lang",
|
||||
"mobileDevice",
|
||||
"userAgent",
|
||||
"useSitemap",
|
||||
|
Loading…
x
Reference in New Issue
Block a user