From daa2492655d2a6d5e57fb223a039a5f8a359bf7d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 6 Oct 2020 06:25:40 +0000 Subject: [PATCH] config work: pass remaining config opts to warc2zim, fixes #13 warc2zim check: add runWarc2Zim() to test warc2zim opts before running for validity run script: create temp dir in output dir to ensure all data is on the volume run script: add --keep option to keep temp dir, otherwise delete --- Dockerfile | 2 +- index.js | 46 ++++++++++++++++++++++++++++++++-------------- run.sh | 37 ++++++++++++++++++++++++++++++++++--- uwsgi.ini | 2 +- 4 files changed, 68 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0d66ed8..20cc680 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN pip install pywb uwsgi # force reinstall of gevent to prevent segfault on uwsgi worker RUN pip install -U gevent -RUN pip install warc2zim==1.0.1 +RUN pip install git+https://github.com/openzim/warc2zim@check-args-no-inputs#foo COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/ COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/ diff --git a/index.js b/index.js index 83c0c91..b45482a 100644 --- a/index.js +++ b/index.js @@ -43,7 +43,7 @@ async function run(params) { // params const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params; - console.log("Limit: " + limit); + //console.log("Limit: " + limit); // links crawled counter let numLinks = 0; @@ -103,17 +103,6 @@ async function run(params) { await cluster.idle(); await cluster.close(); - - const zimName = params.name || new URL(url).hostname; - const zimOutput = params.output || "/output"; - - const warc2zim = `warc2zim -a --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`; - - console.log("Running: " + warc2zim); - - //await new Promise((resolve) => { - child_process.execSync(warc2zim, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"}); - //}); } @@ -221,7 +210,7 @@ function sleep(time) { async function main() { const params = require('yargs') - .usage("zimit [options]") + .usage("zimit [options] [warc2zim options]") .options({ "url": { alias: "u", @@ -299,10 +288,11 @@ async function main() { }) .argv; - console.log("params", params); + runWarc2Zim(params, true); try { await run(params); + runWarc2Zim(params, false); process.exit(0); } catch(e) { console.error("Crawl failed, ZIM creation skipped"); @@ -311,6 +301,34 @@ async function main() { } } +function runWarc2Zim(params, checkOnly = true) { + const OPTS = ["_", "$0", "keep", "workers", "w", "waitUntil", "wait-until", "limit", "timeout", "scope", "exclude", "scroll"]; + + let zimOptsStr = ""; + + for (const key of Object.keys(params)) { + if (!OPTS.includes(key)) { + zimOptsStr += `--${key} ${params[key]} `; + } + } + + const warc2zimCmd = "warc2zim " + zimOptsStr + (checkOnly ? "" : " ./collections/capture/archive/\*.warc.gz"); + + console.log("Running: " + warc2zimCmd); + + const {status} = child_process.spawnSync(warc2zimCmd, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"}); + + if (status && !(checkOnly && status === 100)) { + console.error("Invalid warc2zim params, warc2zim exited with: " + status); + process.exit(status); + } +} + + + + + + main(); diff --git a/run.sh b/run.sh index 74d0752..cb27ad5 100755 --- a/run.sh +++ b/run.sh @@ -1,12 +1,43 @@ #!/bin/bash + +output_dir="/output" + +for val in "$@" +do + if [[ $val == "-o" ]] || [[ $val == "--output" ]]; then + output_dir="$2" + + elif [[ $val == "--keep" ]]; then + keep="1" + fi +done + +cmd="$@" + +curr=$(pwd) + +tmpdir=$(mktemp -d --tmpdir=$output_dir) + +chmod a+rx $tmpdir + +echo "output_dir: $tmpdir" + +pushd $tmpdir + wb-manager init capture -uwsgi uwsgi.ini &> /dev/null & +uwsgi $curr/uwsgi.ini &> /dev/null & # needed for chrome export QT_X11_NO_MITSHM=1 -cmd="$@" +su zimit -c "node $curr/index.js $cmd" -su zimit -c "node index.js $cmd" +popd +# if not keeping, delete temp dir +if [[ -z $keep ]]; then + echo "Removing temp dir $tmpdir" + rm -rf $tmpdir +fi + diff --git a/uwsgi.ini b/uwsgi.ini index 1ded14e..0f46569 100644 --- a/uwsgi.ini +++ b/uwsgi.ini @@ -22,6 +22,6 @@ env = GEVENT_MONKEY_PATCH=1 processes = 8 # specify config file here -env = PYWB_CONFIG_FILE=config.yaml +env = PYWB_CONFIG_FILE=/app/config.yaml wsgi = pywb.apps.wayback