mirror of
https://github.com/openzim/zimit.git
synced 2025-09-21 19:03:19 -04:00
config work: pass remaining config opts to warc2zim, fixes #13
warc2zim check: add runWarc2Zim() to validate the warc2zim options before the crawl runs. run script: create the temp dir inside the output dir to ensure all data is on the volume. run script: add a --keep option to keep the temp dir; otherwise it is deleted.
This commit is contained in:
parent
e4128c8183
commit
daa2492655
@ -12,7 +12,7 @@ RUN pip install pywb uwsgi
|
||||
# force reinstall of gevent to prevent segfault on uwsgi worker
|
||||
RUN pip install -U gevent
|
||||
|
||||
RUN pip install warc2zim==1.0.1
|
||||
RUN pip install git+https://github.com/openzim/warc2zim@check-args-no-inputs#foo
|
||||
|
||||
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
|
||||
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
|
||||
|
46
index.js
46
index.js
@ -43,7 +43,7 @@ async function run(params) {
|
||||
// params
|
||||
const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;
|
||||
|
||||
console.log("Limit: " + limit);
|
||||
//console.log("Limit: " + limit);
|
||||
|
||||
// links crawled counter
|
||||
let numLinks = 0;
|
||||
@ -103,17 +103,6 @@ async function run(params) {
|
||||
|
||||
await cluster.idle();
|
||||
await cluster.close();
|
||||
|
||||
const zimName = params.name || new URL(url).hostname;
|
||||
const zimOutput = params.output || "/output";
|
||||
|
||||
const warc2zim = `warc2zim -a --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;
|
||||
|
||||
console.log("Running: " + warc2zim);
|
||||
|
||||
//await new Promise((resolve) => {
|
||||
child_process.execSync(warc2zim, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"});
|
||||
//});
|
||||
}
|
||||
|
||||
|
||||
@ -221,7 +210,7 @@ function sleep(time) {
|
||||
|
||||
async function main() {
|
||||
const params = require('yargs')
|
||||
.usage("zimit <command> [options]")
|
||||
.usage("zimit [options] [warc2zim options]")
|
||||
.options({
|
||||
"url": {
|
||||
alias: "u",
|
||||
@ -299,10 +288,11 @@ async function main() {
|
||||
})
|
||||
.argv;
|
||||
|
||||
console.log("params", params);
|
||||
runWarc2Zim(params, true);
|
||||
|
||||
try {
|
||||
await run(params);
|
||||
runWarc2Zim(params, false);
|
||||
process.exit(0);
|
||||
} catch(e) {
|
||||
console.error("Crawl failed, ZIM creation skipped");
|
||||
@ -311,6 +301,34 @@ async function main() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Build a warc2zim command line from the parsed CLI params and run it
 * synchronously through /bin/bash, inheriting this process's stdio.
 *
 * Every key in `params` that is not listed in OPTS is forwarded to warc2zim
 * as `--<key> <value>`.
 *
 * @param {Object} params - parsed command-line options (as produced by yargs in main()).
 * @param {boolean} [checkOnly=true] - when true, warc2zim is invoked without
 *   any WARC inputs so that only its options are checked; when false, the
 *   captured WARC files are appended and the ZIM is actually built.
 * @returns {void} Does not return a value; terminates the process via
 *   process.exit() when warc2zim fails or cannot be run.
 */
function runWarc2Zim(params, checkOnly = true) {
  // Keys that are consumed by the crawler itself (or are yargs bookkeeping)
  // and therefore must not be passed on to warc2zim.
  const OPTS = ["_", "$0", "keep", "workers", "w", "waitUntil", "wait-until", "limit", "timeout", "scope", "exclude", "scroll"];

  // The command is interpreted by bash, so anything outside a conservative
  // safe set is single-quoted; otherwise a URL containing `&`, `?`, spaces or
  // quotes would be split or executed by the shell. An empty value becomes ''
  // so it still occupies its argument slot.
  const shellQuote = (value) => {
    const text = String(value);
    return /^[\w@%+=:,.\/-]+$/.test(text) ? text : `'${text.replace(/'/g, "'\\''")}'`;
  };

  const cmdParts = ["warc2zim"];

  for (const [key, value] of Object.entries(params)) {
    if (!OPTS.includes(key)) {
      cmdParts.push(`--${shellQuote(key)}`, shellQuote(value));
    }
  }

  if (!checkOnly) {
    // Deliberately left unquoted: bash must expand the glob to the WARC files.
    cmdParts.push("./collections/capture/archive/*.warc.gz");
  }

  const warc2zimCmd = cmdParts.join(" ");

  console.log("Running: " + warc2zimCmd);

  // stdio: "inherit" already covers stdin, stdout and stderr.
  const {status, signal, error} = child_process.spawnSync(warc2zimCmd, {shell: "/bin/bash", stdio: "inherit"});

  if (error || status === null) {
    // A null status means the child never produced an exit code (the shell
    // could not be spawned, or it was killed by a signal). That must not be
    // mistaken for success.
    console.error("Unable to run warc2zim: " + (error ? error.message : "terminated by signal " + signal));
    process.exit(1);
  } else if (status !== 0 && !(checkOnly && status === 100)) {
    // Exit status 100 is tolerated only during the check-only run.
    // NOTE(review): presumably warc2zim's "arguments valid, no inputs given"
    // code from the check-args-no-inputs branch - confirm against warc2zim.
    console.error("Invalid warc2zim params, warc2zim exited with: " + status);
    process.exit(status);
  }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
main();
|
||||
|
||||
|
||||
|
37
run.sh
37
run.sh
@ -1,12 +1,43 @@
|
||||
#!/bin/bash
|
||||
|
||||
output_dir="/output"
|
||||
|
||||
for val in "$@"
|
||||
do
|
||||
if [[ $val == "-o" ]] || [[ $val == "--output" ]]; then
|
||||
output_dir="$2"
|
||||
|
||||
elif [[ $val == "--keep" ]]; then
|
||||
keep="1"
|
||||
fi
|
||||
done
|
||||
|
||||
cmd="$@"
|
||||
|
||||
curr=$(pwd)
|
||||
|
||||
tmpdir=$(mktemp -d --tmpdir=$output_dir)
|
||||
|
||||
chmod a+rx $tmpdir
|
||||
|
||||
echo "output_dir: $tmpdir"
|
||||
|
||||
pushd $tmpdir
|
||||
|
||||
wb-manager init capture
|
||||
uwsgi uwsgi.ini &> /dev/null &
|
||||
uwsgi $curr/uwsgi.ini &> /dev/null &
|
||||
|
||||
# needed for chrome
|
||||
export QT_X11_NO_MITSHM=1
|
||||
|
||||
cmd="$@"
|
||||
su zimit -c "node $curr/index.js $cmd"
|
||||
|
||||
su zimit -c "node index.js $cmd"
|
||||
popd
|
||||
|
||||
|
||||
# if not keeping, delete temp dir
|
||||
if [[ -z $keep ]]; then
|
||||
echo "Removing temp dir $tmpdir"
|
||||
rm -rf $tmpdir
|
||||
fi
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user