Mirror of https://github.com/openzim/zimit.git (synced 2025-09-24 04:30:11 -04:00)
add help text/validation for all config options; url must now be passed in with --url

- add --scroll boolean option, which activates a simple autoscroll behavior
- use Chrome user-agent for manual fetch
- re-enable pywb auto-fetch option
- clean up Dockerfile: update to warc2zim 1.0.1, install fonts-stix for math/science sites
- update README
This commit is contained in:
parent 01f2471ab8
commit e4128c8183
Dockerfile (12 changed lines)
@@ -8,20 +8,14 @@ ENV PROXY_HOST=localhost \
     PROXY_CA_FILE=/tmp/proxy-ca.pem \
     NO_SOCAT=1

-RUN pip install pywb uwsgi warc2zim
+RUN pip install pywb uwsgi

 # force reinstall of gevent to prevent segfault on uwsgi worker
 RUN pip install -U gevent

-#COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
+RUN pip install warc2zim==1.0.1

-#COPY --from=chrome /app/ /browser/
 COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
 COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
-#COPY --from=chrome /usr/bin/run_forever /usr/bin/
-#COPY --from=chrome /usr/bin/wget /usr/bin/
-#COPY --from=chrome /usr/bin/certutil /usr/bin/
-
-#RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome

 RUN useradd zimit --shell /bin/bash --create-home \
     && usermod -a -G sudo zimit \
@@ -34,7 +28,7 @@ ADD package.json /app/

 RUN chown -R zimit /app

-#USER zimit
+RUN apt-get update && apt-get install -qqy fonts-stix

 RUN yarn install

README.md

@@ -25,20 +25,21 @@ docker build -t openzim/zimit .

 The image accepts the following parameters:

-- `URL` - the url to be crawled (required)
+- `--url URL` - the url to be crawled (required)
 - `--workers N` - number of crawl workers to be run in parallel
 - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
 - `--limit U` - Limit capture to at most U URLs
 - `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times.
+- `--scroll` - if set, will activate a simple auto-scroll behavior on each page.

 The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.

 Example command:

 ```
-docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit URL --name myzimfile --workers 2 --wait-until domcontentloaded
+docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit --url URL --name myzimfile --workers 2 --wait-until domcontentloaded
 ```

 The puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.

config.yaml

@@ -11,7 +11,7 @@ recorder: live

 #autoindex: 10

-#enable_auto_fetch: true
+enable_auto_fetch: true

 collections:
     live: $live
index.js (152 changed lines)
@@ -5,6 +5,8 @@ const fetch = require("node-fetch");
 const AbortController = require("abort-controller");

 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
+const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";


 async function run(params) {
@@ -36,32 +38,10 @@ async function run(params) {
   });

   // Maintain own seen list
-  let seenList = new Set();
-  const url = params._[0];
+  const seenList = new Set();

-  let { waitUntil, timeout, scope, limit, exclude } = params;
-
-  // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
-  waitUntil = waitUntil || "load";
-
-  // Timeout per page
-  timeout = Number(timeout) || 60000;
-
-  // Scope for crawl, default to the domain of the URL
-  scope = scope || new URL(url).origin + "/";
-
-  // Limit number of pages captured
-  limit = Number(limit) || 0;
-
-  if (exclude) {
-    if (typeof(exclude) === "string") {
-      exclude = [new RegExp(exclude)];
-    } else {
-      exclude = exclude.map(e => new RegExp(e));
-    }
-  } else {
-    exclude = [];
-  }
+  // params
+  const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;

   console.log("Limit: " + limit);

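The normalized `exclude` values (now built in the yargs `check()` further down) reach `shouldCrawl` as compiled `RegExp` objects. A hedged sketch of how a list of exclude patterns might filter candidate URLs; `shouldCrawl`'s actual body is not part of this diff, so the `some()` test below is an assumption:

```javascript
// Hypothetical illustration of exclude-regex filtering; the real
// shouldCrawl() implementation is not shown in this diff.
const exclude = ["\\?page=", "/login"].map(e => new RegExp(e));

const candidates = [
  "https://example.com/articles/1",
  "https://example.com/articles?page=2",
  "https://example.com/login",
];

for (const url of candidates) {
  // A URL is skipped if any exclude pattern matches it.
  const skipped = exclude.some(e => e.test(url));
  console.log(url, skipped ? "-> excluded" : "-> crawled");
}
// https://example.com/articles/1 -> crawled
// https://example.com/articles?page=2 -> excluded
// https://example.com/login -> excluded
```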
@@ -82,6 +62,14 @@ async function run(params) {
     console.log(`Load timeout for ${url}`);
   }

+  if (scroll) {
+    try {
+      await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
+    } catch (e) {
+      console.warn("Behavior Failed", e);
+    }
+  }
+
   let results = null;

   try {
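The `Promise.race` caps the in-page scroll behavior at 30 seconds: whichever promise settles first ends the wait, so a page with endless scrollable content cannot stall the crawl (the losing promise is not cancelled, only no longer awaited). A minimal sketch of the pattern; `slowTask` and `demo` are hypothetical names, not part of the diff:

```javascript
// Race a possibly long-running task against a fixed sleep.
// "slowTask" is a hypothetical stand-in for page.evaluate(autoScroll).
function sleep(time) {
  return new Promise(resolve => setTimeout(resolve, time));
}

async function slowTask() {
  await sleep(60000); // pretends to scroll for a full minute
  return "done";
}

async function demo() {
  // Resolves after ~30s via the sleep, even though slowTask needs 60s.
  // Note: the race only stops the *waiting*; slowTask keeps running
  // in the background rather than being aborted.
  await Promise.race([slowTask(), sleep(30000)]);
  console.log("moving on to capture results");
}

demo();
```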
@@ -119,7 +107,7 @@ async function run(params) {
   const zimName = params.name || new URL(url).hostname;
   const zimOutput = params.output || "/output";

-  const warc2zim = `warc2zim --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;
+  const warc2zim = `warc2zim -a --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;

   console.log("Running: " + warc2zim);

@@ -169,7 +157,9 @@ function shouldCrawl(scope, seenList, url, exclude) {

 async function htmlCheck(url, capturePrefix) {
   try {
-    const resp = await fetch(url, {method: "HEAD"});
+    const headers = {"User-Agent": CHROME_USER_AGENT};
+
+    const resp = await fetch(url, {method: "HEAD", headers});

     if (resp.status >= 400) {
       console.log(`Skipping ${url}, invalid status ${resp.status}`);
@@ -193,7 +183,7 @@ async function htmlCheck(url, capturePrefix) {
     console.log(`Direct capture: ${capturePrefix}${url}`);
     const abort = new AbortController();
     const signal = abort.signal;
-    const resp2 = await fetch(capturePrefix + url, {signal});
+    const resp2 = await fetch(capturePrefix + url, {signal, headers});
     abort.abort();

     return false;
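Both fetches in `htmlCheck` now present the Chrome user-agent, so servers should return the same variant of a resource to the manual check as they do to the headless browser. A self-contained sketch of the HEAD check, assuming node-fetch v2; the URL and the `headCheck` helper name are illustrative, not from the diff:

```javascript
// Hedged sketch: HEAD-check a URL while presenting a Chrome user-agent,
// mirroring the change to htmlCheck(). Assumes node-fetch v2.
const fetch = require("node-fetch");

const CHROME_USER_AGENT =
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";

async function headCheck(url) {
  const headers = { "User-Agent": CHROME_USER_AGENT };
  const resp = await fetch(url, { method: "HEAD", headers });

  if (resp.status >= 400) {
    console.log(`Skipping ${url}, invalid status ${resp.status}`);
    return null;
  }
  // The Content-Type decides whether a page must go through the browser
  // (HTML) or can be captured directly (non-HTML).
  return resp.headers.get("Content-Type");
}

headCheck("https://example.com/").then(ct => console.log("Content-Type:", ct));
```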
@@ -205,9 +195,111 @@ async function htmlCheck(url, capturePrefix) {
 }


+async function autoScroll() {
+  const canScrollMore = () =>
+    self.scrollY + self.innerHeight <
+    Math.max(
+      self.document.body.scrollHeight,
+      self.document.body.offsetHeight,
+      self.document.documentElement.clientHeight,
+      self.document.documentElement.scrollHeight,
+      self.document.documentElement.offsetHeight
+    );
+
+  const scrollOpts = { top: 250, left: 0, behavior: 'auto' };
+
+  while (canScrollMore()) {
+    self.scrollBy(scrollOpts);
+    await new Promise(resolve => setTimeout(resolve, 500));
+  }
+}
+
+function sleep(time) {
+  return new Promise(resolve => setTimeout(resolve, time));
+}
+
+
 async function main() {
-  const params = require('yargs').argv;
-  console.log(params);
+  const params = require('yargs')
+    .usage("zimit <command> [options]")
+    .options({
+      "url": {
+        alias: "u",
+        describe: "The URL to start crawling from and main page for ZIM",
+        demandOption: true,
+        type: "string",
+      },
+
+      "workers": {
+        alias: "w",
+        describe: "The number of workers to run in parallel",
+        demandOption: false,
+        default: 1,
+        type: "number",
+      },
+
+      "waitUntil": {
+        describe: "Puppeteer page.goto() condition to wait for before continuing",
+        default: "load",
+      },
+
+      "limit": {
+        describe: "Limit crawl to this number of pages",
+        default: 0,
+        type: "number",
+      },
+
+      "timeout": {
+        describe: "Timeout for each page to load (in millis)",
+        default: 30000,
+        type: "number",
+      },
+
+      "scope": {
+        describe: "The scope of current page that should be included in the crawl (defaults to the domain of URL)",
+      },
+
+      "exclude": {
+        describe: "Regex of URLs that should be excluded from the crawl."
+      },
+
+      "scroll": {
+        describe: "If set, will autoscroll to bottom of the page",
+        type: "boolean",
+        default: false,
+    }}).check((argv, option) => {
+      // Scope for crawl, default to the domain of the URL
+      const url = new URL(argv.url);
+
+      if (url.protocol !== "http:" && url.protocol != "https:") {
+        throw new Error("URL must start with http:// or https://");
+      }
+
+      if (!argv.scope) {
+        argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
+      }
+
+      // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
+      // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
+      if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
+        throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
+      }
+
+      if (argv.exclude) {
+        if (typeof(argv.exclude) === "string") {
+          argv.exclude = [new RegExp(argv.exclude)];
+        } else {
+          argv.exclude = argv.exclude.map(e => new RegExp(e));
+        }
+      } else {
+        argv.exclude = [];
+      }
+
+      return true;
+    })
+    .argv;
+
+  console.log("params", params);
+
   try {
     await run(params);
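One behavioral nuance of the new `check()`: the default scope is now derived from the full URL up to its last slash, rather than `new URL(url).origin + "/"` as before, so a crawl started on a subpath stays within that subpath instead of spanning the whole host. A small sketch contrasting the two defaults; the URL is illustrative:

```javascript
// Old default scope vs. new default scope, for an illustrative URL.
const url = new URL("https://example.com/wiki/Main_Page");

// Previous behavior: scope was the origin, so the whole host was in scope.
const oldScope = url.origin + "/"; // "https://example.com/"

// New behavior: everything up to (and including) the last slash.
const newScope = url.href.slice(0, url.href.lastIndexOf("/") + 1); // "https://example.com/wiki/"

console.log(oldScope, newScope);
```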