mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 03:12:04 -04:00
move warc2zim to be launched by node process
This commit is contained in:
parent
1de577bd78
commit
4e04645e6b
@ -24,12 +24,15 @@ The image accepts the following parameters:
|
||||
- "<URL>" - the url to be crawled (required)
|
||||
- `--workers N` - number of crawl workers to be run in parallel
|
||||
- `--wait-until` - Puppeteer setting that determines which event must fire before a page is considered loaded. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (for example, to avoid waiting for ads to load).
|
||||
- `--name` - Name of ZIM file (defaults to the hostname of the URL)
|
||||
- `--output` - output directory (defaults to `/output`)
|
||||
|
||||
|
||||
|
||||
Example command:
|
||||
|
||||
```
|
||||
docker run -d -e NAME=myzimfile -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<URL>" --workers 2 --wait-until domcontentloaded
|
||||
docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<URL>" --name myzimfile --workers 2 --wait-until domcontentloaded
|
||||
```
|
||||
|
||||
<hr>
|
||||
|
12
index.js
12
index.js
@ -1,5 +1,6 @@
|
||||
const puppeteer = require("puppeteer-core");
|
||||
const { Cluster } = require("puppeteer-cluster");
|
||||
const child_process = require("child_process");
|
||||
|
||||
async function run(params) {
|
||||
const args = [
|
||||
@ -66,6 +67,17 @@ async function run(params) {
|
||||
|
||||
await cluster.idle();
|
||||
await cluster.close();
|
||||
|
||||
const zimName = params.name || new URL(url).hostname;
|
||||
const zimOutput = params.output || "/output";
|
||||
|
||||
const warc2zim = `warc2zim --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;
|
||||
|
||||
console.log("Running: " + warc2zim);
|
||||
|
||||
//await new Promise((resolve) => {
|
||||
child_process.execSync(warc2zim, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"});
|
||||
//});
|
||||
}
|
||||
|
||||
|
||||
|
21
run.sh
21
run.sh
@ -4,26 +4,7 @@ URL="$1"
|
||||
wb-manager init capture
|
||||
uwsgi uwsgi.ini &> /dev/null &
|
||||
|
||||
#/browser/browser_entrypoint.sh /browser/run.sh &
|
||||
#if [[ -n "$PROXY_CA_FILE" && -f "$PROXY_CA_FILE" && -n "$PROXY_HOST" ]]; then
|
||||
# rm -rf "$HOME/.pki/nssdb"
|
||||
# mkdir -p "$HOME/.pki/nssdb"
|
||||
# certutil -d "$HOME/.pki/nssdb" -N
|
||||
# certutil -d "sql:$HOME/.pki/nssdb" -A -t "C,," -n "Proxy" -i "$PROXY_CA_FILE"
|
||||
# rm "$PROXY_CA_FILE"
|
||||
#fi
|
||||
|
||||
#mkdir ~/.config/
|
||||
#mkdir ~/.config/google-chrome
|
||||
#touch ~/.config/google-chrome/First\ Run
|
||||
|
||||
# needed for chrome
|
||||
export QT_X11_NO_MITSHM=1
|
||||
|
||||
node index.js "$@"
|
||||
|
||||
NAME=${NAME:=zimfile}
|
||||
|
||||
|
||||
warc2zim --url $URL --name $NAME --output=/output ./collections/capture/archive/*.warc.gz
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user