From 4e04645e6b5bf4a75b563c146557523d23c8be31 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Sep 2020 22:47:19 +0000 Subject: [PATCH] move warc2zim to be launched by node process --- README.md | 5 ++++- index.js | 12 ++++++++++++ run.sh | 21 +-------------------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 1aa0fbe..4c47f89 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,15 @@ The image accepts the following parameters: - "<URL>" - the url to be crawled (required) - `--workers N` - number of crawl workers to be run in parallel - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). +- `--name` - Name of ZIM file (defaults to the hostname of the URL) +- `--output` - output directory (defaults to `/output`) + Example command: ``` -docker run -d -e NAME=myzimfile -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<URL>" --workers 2 --wait-until domcontentloaded +docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<URL>" --name myzimfile --workers 2 --wait-until domcontentloaded ```
diff --git a/index.js b/index.js index 305c7e0..a49beeb 100644 --- a/index.js +++ b/index.js @@ -1,5 +1,6 @@ const puppeteer = require("puppeteer-core"); const { Cluster } = require("puppeteer-cluster"); +const child_process = require("child_process"); async function run(params) { const args = [ @@ -66,6 +67,17 @@ async function run(params) { await cluster.idle(); await cluster.close(); + + const zimName = params.name || new URL(url).hostname; + const zimOutput = params.output || "/output"; + + const warc2zim = `warc2zim --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`; + + console.log("Running: " + warc2zim); + + //await new Promise((resolve) => { + child_process.execSync(warc2zim, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"}); + //}); } diff --git a/run.sh b/run.sh index e47d0ea..b9bd9fc 100755 --- a/run.sh +++ b/run.sh @@ -4,26 +4,7 @@ URL="$1" wb-manager init capture uwsgi uwsgi.ini &> /dev/null & -#/browser/browser_entrypoint.sh /browser/run.sh & -#if [[ -n "$PROXY_CA_FILE" && -f "$PROXY_CA_FILE" && -n "$PROXY_HOST" ]]; then -# rm -rf "$HOME/.pki/nssdb" -# mkdir -p "$HOME/.pki/nssdb" -# certutil -d "$HOME/.pki/nssdb" -N -# certutil -d "sql:$HOME/.pki/nssdb" -A -t "C,," -n "Proxy" -i "$PROXY_CA_FILE" -# rm "$PROXY_CA_FILE" -#fi - -#mkdir ~/.config/ -#mkdir ~/.config/google-chrome -#touch ~/.config/google-chrome/First\ Run - +# needed for chrome export QT_X11_NO_MITSHM=1 node index.js "$@" - -NAME=${NAME:=zimfile} - - -warc2zim --url $URL --name $NAME --output=/output ./collections/capture/archive/*.warc.gz - -