move warc2zim to be launched by node process

This commit is contained in:
Ilya Kreymer 2020-09-19 22:47:19 +00:00
parent 1de577bd78
commit 4e04645e6b
3 changed files with 17 additions and 21 deletions

View File

@ -24,12 +24,15 @@ The image accepts the following parameters:
- "<URL>" - the url to be crawled (required)
- `--workers N` - number of crawl workers to be run in parallel
- `--wait-until` - Puppeteer setting that determines when a page is considered loaded. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (for example, to avoid waiting for ads to load).
- `--name` - name of the ZIM file (defaults to the hostname of the URL)
- `--output` - output directory (defaults to `/output`)
Example command:
```
docker run -d -e NAME=myzimfile -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<URL>" --workers 2 --wait-until domcontentloaded
docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<URL>" --name myzimfile --workers 2 --wait-until domcontentloaded
```
<hr>

View File

@ -1,5 +1,6 @@
const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
async function run(params) {
const args = [
@ -66,6 +67,17 @@ async function run(params) {
await cluster.idle();
await cluster.close();
// Convert the crawl's WARC files into a ZIM file by invoking the external
// `warc2zim` command-line tool. The ZIM name falls back to the hostname of
// the crawled URL, and the output directory falls back to "/output".
const zimName = params.name || new URL(url).hostname;
const zimOutput = params.output || "/output";

// A shell is still required to expand the *.warc.gz glob, but the
// caller-supplied values are handed to bash as positional parameters
// ("$1".."$3") instead of being spliced into the command text. This keeps
// URLs or names containing spaces, quotes, `;`, `&`, `$(...)` etc. from being
// interpreted as shell syntax.
const warc2zimScript = 'warc2zim --url "$1" --name "$2" --output "$3" ./collections/capture/archive/*.warc.gz';
const warc2zimArgs = [url, zimName, zimOutput];

console.log("Running: " + warc2zimScript + " with arguments " + JSON.stringify(warc2zimArgs));

// Synchronous on purpose: nothing else is left to do until the conversion
// finishes. `stdio: "inherit"` forwards stdout and stderr of the child to
// this process; a non-zero exit status makes execFileSync throw.
// The argument after the script text ("warc2zim") becomes $0 inside bash.
child_process.execFileSync(
"/bin/bash",
["-c", warc2zimScript, "warc2zim", url, zimName, zimOutput],
{stdio: "inherit"}
);
}

21
run.sh
View File

@ -4,26 +4,7 @@ URL="$1"
wb-manager init capture
uwsgi uwsgi.ini &> /dev/null &
#/browser/browser_entrypoint.sh /browser/run.sh &
#if [[ -n "$PROXY_CA_FILE" && -f "$PROXY_CA_FILE" && -n "$PROXY_HOST" ]]; then
# rm -rf "$HOME/.pki/nssdb"
# mkdir -p "$HOME/.pki/nssdb"
# certutil -d "$HOME/.pki/nssdb" -N
# certutil -d "sql:$HOME/.pki/nssdb" -A -t "C,," -n "Proxy" -i "$PROXY_CA_FILE"
# rm "$PROXY_CA_FILE"
#fi
#mkdir ~/.config/
#mkdir ~/.config/google-chrome
#touch ~/.config/google-chrome/First\ Run
# needed for chrome
export QT_X11_NO_MITSHM=1
node index.js "$@"
NAME=${NAME:=zimfile}
warc2zim --url $URL --name $NAME --output=/output ./collections/capture/archive/*.warc.gz