Merge pull request #201 from openzim/lang

crawler 0.10.2
This commit is contained in:
rgaudin 2023-08-02 14:35:52 +00:00 committed by GitHub
commit 7cb118eaeb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 23 deletions

View File

@ -17,4 +17,4 @@ jobs:
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite - name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py" run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"

View File

@ -12,14 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `--title` to set ZIM title - `--title` to set ZIM title
- `--description` to set ZIM description - `--description` to set ZIM description
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization` - New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
### Changed ### Changed
- Using browsertrix-crawler 0.10.0-beta.4 - Using browsertrix-crawler 0.10.2
- Default and accepted values for `--waitUntil` from crawler's update - Default and accepted values for `--waitUntil` from crawler's update
- Using `main` warc2zim ⚠️ change before releasing! - Using `main` warc2zim ⚠️ change before releasing!
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172) - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
- `--failOnFailedSeed` used unconditionally - `--failOnFailedSeed` used unconditionally
- `--lang` now passed to crawler (ISO-639-1)
### Removed ### Removed

View File

@ -1,33 +1,35 @@
FROM webrecorder/browsertrix-crawler:0.10.0-beta.4 FROM webrecorder/browsertrix-crawler:0.10.2
LABEL org.opencontainers.image.source https://github.com/openzim/zimit LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
# temp (needs warc2zim release on zimit release) libmagic1 \
RUN pip3 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.5.1' && \ python3.10-venv \
pip3 uninstall -y warc2zim && \ && rm -rf /var/lib/apt/lists/* \
pip3 install 'git+https://github.com/openzim/warc2zim@main#egg_name=warc2zim' # python setup (in venv not to conflict with browsertrix)
&& python3 -m venv /app/zimit \
RUN mkdir -p /output && /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' 'warc2zim==1.5.2' \
# placeholder (default output location)
WORKDIR /app && mkdir -p /output \
# disable chrome upgrade
# download list of bad domains to filter-out. intentionally run post-install && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# so it's not cached in earlier layers (url stays same but content updated) # download list of bad domains to filter-out. intentionally run post-install \
RUN mkdir -p /tmp/ads && cd /tmp/ads && \ # so it's not cached in earlier layers (url stays same but content updated) \
mkdir -p /tmp/ads && cd /tmp/ads && \
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \ curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \ curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \ curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
cat ./*.txt > /etc/blocklist.txt \ cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt && rm ./*.txt \
RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \ && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
chmod +x /usr/local/bin/entrypoint.sh chmod +x /usr/local/bin/entrypoint.sh
WORKDIR /app
ADD zimit.py /app/ ADD zimit.py /app/
# fix shebang on zimit to use in-venv python
RUN ln -s /app/zimit.py /usr/bin/zimit RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
&& ln -s /app/zimit.py /usr/bin/zimit \
RUN printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome && chmod +x /usr/bin/zimit
ENTRYPOINT ["entrypoint.sh"] ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit"] CMD ["zimit"]

View File

@ -205,6 +205,18 @@ def zimit(args=None):
action="store_true", action="store_true",
) )
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
)
parser.add_argument(
"--zim-lang",
help="Language metadata of ZIM "
"(warc2zim --lang param). ISO-639-3 code. "
"Retrieved from homepage if found, fallback to `eng`",
)
parser.add_argument( parser.add_argument(
"--mobileDevice", "--mobileDevice",
help="Emulate mobile device by name from " help="Emulate mobile device by name from "
@ -348,6 +360,10 @@ def zimit(args=None):
warc2zim_args.append("--description") warc2zim_args.append("--description")
warc2zim_args.append(zimit_args.description) warc2zim_args.append(zimit_args.description)
if zimit_args.zim_lang:
warc2zim_args.append("--lang")
warc2zim_args.append(zimit_args.zim_lang)
print("----------") print("----------")
print("Testing warc2zim args") print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True) print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
@ -482,6 +498,7 @@ def get_node_cmd_line(args):
"exclude", "exclude",
"collection", "collection",
"allowHashUrls", "allowHashUrls",
"lang",
"mobileDevice", "mobileDevice",
"userAgent", "userAgent",
"useSitemap", "useSitemap",