config work: pass remaining config opts to warc2zim, fixes #13

warc2zim check: add runWarc2Zim() to test warc2zim opts before running for validity
run script: create temp dir in output dir to ensure all data is on the volume
run script: add --keep option to keep temp dir, otherwise delete
This commit is contained in:
Ilya Kreymer 2020-10-06 06:25:40 +00:00
parent e4128c8183
commit daa2492655
4 changed files with 68 additions and 19 deletions

View File

@ -12,7 +12,7 @@ RUN pip install pywb uwsgi
# force reinstall of gevent to prevent segfault on uwsgi worker
RUN pip install -U gevent
RUN pip install warc2zim==1.0.1
RUN pip install git+https://github.com/openzim/warc2zim@check-args-no-inputs#foo
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/

View File

@ -43,7 +43,7 @@ async function run(params) {
// params
const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;
console.log("Limit: " + limit);
//console.log("Limit: " + limit);
// links crawled counter
let numLinks = 0;
@ -103,17 +103,6 @@ async function run(params) {
await cluster.idle();
await cluster.close();
const zimName = params.name || new URL(url).hostname;
const zimOutput = params.output || "/output";
const warc2zim = `warc2zim -a --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;
console.log("Running: " + warc2zim);
//await new Promise((resolve) => {
child_process.execSync(warc2zim, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"});
//});
}
@ -221,7 +210,7 @@ function sleep(time) {
async function main() {
const params = require('yargs')
.usage("zimit <command> [options]")
.usage("zimit [options] [warc2zim options]")
.options({
"url": {
alias: "u",
@ -299,10 +288,11 @@ async function main() {
})
.argv;
console.log("params", params);
runWarc2Zim(params, true);
try {
await run(params);
runWarc2Zim(params, false);
process.exit(0);
} catch(e) {
console.error("Crawl failed, ZIM creation skipped");
@ -311,6 +301,34 @@ async function main() {
}
}
/**
 * Build a `warc2zim` command line from the parsed CLI params and run it.
 *
 * Every key in `params` that is not one of the crawler's own options is
 * forwarded as `--<key> <value>`:
 *   - `true` is forwarded as a bare flag (`--key`),
 *   - `false`, `null` and `undefined` are not forwarded,
 *   - an array is forwarded as one `--key value` pair per element.
 *
 * The command runs through bash so that the WARC glob is expanded; all
 * forwarded tokens are therefore shell-quoted when they contain characters
 * that bash would interpret.
 *
 * NOTE(review): alias keys present in `params` (e.g. a one-letter alias) are
 * forwarded in long form (`--u value`) exactly as before; confirm warc2zim
 * accepts them.
 *
 * @param {Object} params - Parsed command-line options.
 * @param {boolean} [checkOnly=true] - When true, run without any WARC input
 *   to validate the options only; exit status 100 is then accepted
 *   (presumably warc2zim's "arguments valid, no inputs" code; confirm).
 *   When false, the captured WARCs are appended as inputs.
 * @returns {void} Returns only on success. Otherwise the process exits with
 *   the status of warc2zim, or with 1 if it was killed by a signal or could
 *   not be started.
 */
function runWarc2Zim(params, checkOnly = true) {
  // Options consumed by the crawler itself; never forwarded to warc2zim.
  const OPTS = ["_", "$0", "keep", "workers", "w", "waitUntil", "wait-until", "limit", "timeout", "scope", "exclude", "scroll"];

  // Quote a token for bash unless it only contains characters that are safe
  // unquoted. Single quotes inside the token are closed, escaped and reopened.
  const shellQuote = (value) => {
    const text = String(value);
    if (/^[A-Za-z0-9_@%+=:,.\/-]+$/.test(text)) {
      return text;
    }
    return `'${text.replace(/'/g, "'\\''")}'`;
  };

  const zimArgs = [];

  for (const [key, rawValue] of Object.entries(params)) {
    if (OPTS.includes(key)) {
      continue;
    }

    const values = Array.isArray(rawValue) ? rawValue : [rawValue];

    for (const value of values) {
      if (value === undefined || value === null || value === false) {
        continue;
      }

      zimArgs.push(shellQuote(`--${key}`));

      // A boolean flag carries no value; anything else is passed as its argument.
      if (value !== true) {
        zimArgs.push(shellQuote(value));
      }
    }
  }

  if (!checkOnly) {
    // Deliberately unquoted: bash must expand the glob to the captured WARCs.
    zimArgs.push("./collections/capture/archive/*.warc.gz");
  }

  const warc2zimCmd = ["warc2zim", ...zimArgs].join(" ");

  console.log("Running: " + warc2zimCmd);

  const {status, signal, error} = child_process.spawnSync(warc2zimCmd, {shell: "/bin/bash", stdio: "inherit"});

  if (error || status === null) {
    // status is null when the child was killed by a signal or never started;
    // that must not be mistaken for a successful run.
    const reason = error ? error.message : "killed by signal " + signal;
    console.error("warc2zim did not run to completion: " + reason);
    process.exit(1);
  } else if (status && !(checkOnly && status === 100)) {
    console.error("Invalid warc2zim params, warc2zim exited with: " + status);
    process.exit(status);
  }
}
main();

37
run.sh
View File

@ -1,12 +1,43 @@
#!/bin/bash
# Entry script: creates a temporary working directory inside the output
# directory, initialises the capture collection there, starts uwsgi in the
# background, runs the node crawler as user `zimit`, and finally removes the
# temporary directory unless --keep was given.
# NOTE(review): this listing appears to come from a diff view; a few
# consecutive lines look like the old and new version of the same statement.
# Confirm which ones are current before relying on them.

# default output directory; overridden when -o/--output is found below
output_dir="/output"
# scan all arguments for the options this script itself cares about
for val in "$@"
do
if [[ $val == "-o" ]] || [[ $val == "--output" ]]; then
# NOTE(review): "$2" is always the script's second positional argument, not
# the argument following the matched flag, so this is only right when the
# flag is the first argument. Confirm intended behaviour.
output_dir="$2"
elif [[ $val == "--keep" ]]; then
keep="1"
fi
done
# NOTE(review): joins all arguments into one string; the original word
# boundaries/quoting are lost when this is re-split later.
cmd="$@"
# remember the starting directory so files there can be referenced after pushd
curr=$(pwd)
# create the working directory inside the output directory
tmpdir=$(mktemp -d --tmpdir=$output_dir)
# make the directory readable and traversable by all users (the crawler below
# runs as user `zimit`)
chmod a+rx $tmpdir
echo "output_dir: $tmpdir"
pushd $tmpdir
# presumably creates the pywb collection named "capture" in the current
# directory (wb-manager is external to this script)
wb-manager init capture
# start uwsgi in the background, discarding its output
# NOTE(review): two uwsgi invocations are listed here; presumably only the
# second (absolute path to the ini) is current. Confirm.
uwsgi uwsgi.ini &> /dev/null &
uwsgi $curr/uwsgi.ini &> /dev/null &
# needed for chrome
export QT_X11_NO_MITSHM=1
cmd="$@"
# run the crawler as the unprivileged `zimit` user
# NOTE(review): two invocations are listed; after pushd only the one using
# $curr/index.js resolves relative to the starting directory. Confirm.
su zimit -c "node $curr/index.js $cmd"
su zimit -c "node index.js $cmd"
popd
# if not keeping, delete temp dir
if [[ -z $keep ]]; then
echo "Removing temp dir $tmpdir"
rm -rf $tmpdir
fi

View File

@ -22,6 +22,6 @@ env = GEVENT_MONKEY_PATCH=1
processes = 8
# specify config file here
env = PYWB_CONFIG_FILE=config.yaml
env = PYWB_CONFIG_FILE=/app/config.yaml
wsgi = pywb.apps.wayback