diff --git a/Dockerfile b/Dockerfile
index 432b827..0d66ed8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,20 +8,14 @@ ENV PROXY_HOST=localhost \
     PROXY_CA_FILE=/tmp/proxy-ca.pem \
     NO_SOCAT=1

-RUN pip install pywb uwsgi warc2zim
+RUN pip install pywb uwsgi

 # force reinstall of gevent to prevent segfault on uwsgi worker
 RUN pip install -U gevent

-#COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
-
-#COPY --from=chrome /app/ /browser/
+RUN pip install warc2zim==1.0.1
+
 COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
 COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
-#COPY --from=chrome /usr/bin/run_forever /usr/bin/
-#COPY --from=chrome /usr/bin/wget /usr/bin/
-#COPY --from=chrome /usr/bin/certutil /usr/bin/
-
-#RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome

 RUN useradd zimit --shell /bin/bash --create-home \
     && usermod -a -G sudo zimit \
@@ -34,7 +28,7 @@ ADD package.json /app/

 RUN chown -R zimit /app

-#USER zimit
+RUN apt-get update && apt-get install -qqy fonts-stix

 RUN yarn install
diff --git a/README.md b/README.md
index f42d6df..b62cc7e 100644
--- a/README.md
+++ b/README.md
@@ -25,20 +25,21 @@ docker build -t openzim/zimit .

 The image accepts the following parameters:

-- `URL` - the url to be crawled (required)
+- `--url URL` - the url to be crawled (required)
 - `--workers N` - number of crawl workers to be run in parallel
 - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
 - `--limit U` - Limit capture to at most U URLs
 - `--exclude` - skip URLs that match the regex from crawling. Can be specified multiple times.
+- `--scroll` - if set, will activate a simple auto-scroll behavior on each page.

 The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.

 Example command:

 ```
-docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit URL --name myzimfile --workers 2 --wait-until domcontentloaded
+docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit --url URL --name myzimfile --workers 2 --wait-until domcontentloaded
 ```

 The puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.
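Reviewer note: the README now documents that `--exclude` may be repeated. As an illustrative sketch only (the example patterns and URLs are hypothetical, and the `some()` matching is an assumption about how `shouldCrawl` applies the filters), this mirrors the RegExp compilation added in the yargs `check()` in index.js below:

```
// yargs yields a string for a single --exclude and an array for several,
// hence the typeof branch before compiling RegExp filters.
let exclude = ["\\?print=1", "/login"]; // e.g. --exclude "\?print=1" --exclude /login
if (typeof exclude === "string") {
  exclude = [new RegExp(exclude)];
} else {
  exclude = exclude.map((e) => new RegExp(e));
}

// A candidate link is presumably skipped if any pattern matches it.
const skip = (url) => exclude.some((re) => re.test(url));
console.log(skip("https://example.com/login"));   // true  -> not crawled
console.log(skip("https://example.com/article")); // false -> crawled
```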
diff --git a/config.yaml b/config.yaml
index a072f5a..7f16e3c 100644
--- a/config.yaml
+++ b/config.yaml
@@ -11,7 +11,7 @@ recorder: live

 #autoindex: 10

-#enable_auto_fetch: true
+enable_auto_fetch: true

 collections:
     live: $live
diff --git a/index.js b/index.js
index 0f4103b..83c0c91 100644
--- a/index.js
+++ b/index.js
@@ -5,6 +5,8 @@
 const fetch = require("node-fetch");
 const AbortController = require("abort-controller");

 const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
+const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";

 async function run(params) {
@@ -36,32 +38,10 @@
   });

   // Maintain own seen list
-  let seenList = new Set();
-  const url = params._[0];
+  const seenList = new Set();

-  let { waitUntil, timeout, scope, limit, exclude } = params;
-
-  // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
-  waitUntil = waitUntil || "load";
-
-  // Timeout per page
-  timeout = Number(timeout) || 60000;
-
-  // Scope for crawl, default to the domain of the URL
-  scope = scope || new URL(url).origin + "/";
-
-  // Limit number of pages captured
-  limit = Number(limit) || 0;
-
-  if (exclude) {
-    if (typeof(exclude) === "string") {
-      exclude = [new RegExp(exclude)];
-    } else {
-      exclude = exclude.map(e => new RegExp(e));
-    }
-  } else {
-    exclude = [];
-  }
+  // params
+  const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;

   console.log("Limit: " + limit);
@@ -82,6 +62,14 @@
     console.log(`Load timeout for ${url}`);
   }

+  if (scroll) {
+    try {
+      await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
+    } catch (e) {
+      console.warn("Behavior Failed", e);
+    }
+  }
+
   let results = null;

   try {
@@ -119,7 +107,7 @@
   const zimName = params.name || new URL(url).hostname;
   const zimOutput = params.output || "/output";

-  const warc2zim = `warc2zim --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;
+  const warc2zim = `warc2zim -a --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`;

   console.log("Running: " + warc2zim);
@@ -169,7 +157,9 @@ function shouldCrawl(scope, seenList, url, exclude) {

 async function htmlCheck(url, capturePrefix) {
   try {
-    const resp = await fetch(url, {method: "HEAD"});
+    const headers = {"User-Agent": CHROME_USER_AGENT};
+
+    const resp = await fetch(url, {method: "HEAD", headers});

     if (resp.status >= 400) {
       console.log(`Skipping ${url}, invalid status ${resp.status}`);
@@ -193,7 +183,7 @@
     console.log(`Direct capture: ${capturePrefix}${url}`);
     const abort = new AbortController();
     const signal = abort.signal;
-    const resp2 = await fetch(capturePrefix + url, {signal});
+    const resp2 = await fetch(capturePrefix + url, {signal, headers});
     abort.abort();

     return false;
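Reviewer note: the two `fetch()` changes above make the HEAD probe and the direct-capture request present the same Chrome User-Agent the browser uses, so servers that vary responses by UA see consistent requests. A minimal standalone sketch of the probe pattern, assuming node-fetch (the helper name is hypothetical):

```
const fetch = require("node-fetch");

const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";

// Hypothetical helper: HEAD-request a URL with the Chrome UA and report
// whether the response looks like an HTML page worth loading in the browser.
async function probeIsHtml(url) {
  const headers = {"User-Agent": CHROME_USER_AGENT};
  const resp = await fetch(url, {method: "HEAD", headers});
  const mime = (resp.headers.get("Content-Type") || "").split(";")[0].trim();
  return resp.status < 400 && HTML_TYPES.includes(mime);
}
```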
@@ -205,9 +195,111 @@
 }


+async function autoScroll() {
+  const canScrollMore = () =>
+    self.scrollY + self.innerHeight <
+    Math.max(
+      self.document.body.scrollHeight,
+      self.document.body.offsetHeight,
+      self.document.documentElement.clientHeight,
+      self.document.documentElement.scrollHeight,
+      self.document.documentElement.offsetHeight
+    );
+
+  const scrollOpts = { top: 250, left: 0, behavior: 'auto' };
+
+  while (canScrollMore()) {
+    self.scrollBy(scrollOpts);
+    await new Promise(resolve => setTimeout(resolve, 500));
+  }
+}
+
+function sleep(time) {
+  return new Promise(resolve => setTimeout(resolve, time));
+}
+
+
 async function main() {
-  const params = require('yargs').argv;
-  console.log(params);
+  const params = require('yargs')
+    .usage("zimit [options]")
+    .options({
+      "url": {
+        alias: "u",
+        describe: "The URL to start crawling from and main page for ZIM",
+        demandOption: true,
+        type: "string",
+      },
+
+      "workers": {
+        alias: "w",
+        describe: "The number of workers to run in parallel",
+        demandOption: false,
+        default: 1,
+        type: "number",
+      },
+
+      "waitUntil": {
+        describe: "Puppeteer page.goto() condition to wait for before continuing",
+        default: "load",
+      },
+
+      "limit": {
+        describe: "Limit crawl to this number of pages",
+        default: 0,
+        type: "number",
+      },
+
+      "timeout": {
+        describe: "Timeout for each page to load (in millis)",
+        default: 30000,
+        type: "number",
+      },
+
+      "scope": {
+        describe: "The scope of current page that should be included in the crawl (defaults to the domain of URL)",
+      },
+
+      "exclude": {
+        describe: "Regex of URLs that should be excluded from the crawl."
+      },
+
+      "scroll": {
+        describe: "If set, will autoscroll to bottom of the page",
+        type: "boolean",
+        default: false,
+      }}).check((argv, option) => {
+        // Scope for crawl, default to the domain of the URL
+        const url = new URL(argv.url);
+
+        if (url.protocol !== "http:" && url.protocol != "https:") {
+          throw new Error("URL must start with http:// or https://");
+        }
+
+        if (!argv.scope) {
+          argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
+        }
+
+        // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
+        // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
+        if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
+          throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
+        }
+
+        if (argv.exclude) {
+          if (typeof(argv.exclude) === "string") {
+            argv.exclude = [new RegExp(argv.exclude)];
+          } else {
+            argv.exclude = argv.exclude.map(e => new RegExp(e));
+          }
+        } else {
+          argv.exclude = [];
+        }
+
+        return true;
+      })
+    .argv;
+
+  console.log("params", params);

   try {
     await run(params);
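Reviewer note: one behavioral change in the new `check()` validation is that the default scope is no longer the URL origin but everything up to the last `/` of the start URL. A quick sketch of what that computes (the example URL is hypothetical):

```
// Scope default from check() above: keep everything up to and including
// the last "/" of the start URL (vs. the old `new URL(url).origin + "/"`).
const url = new URL("https://example.com/docs/page.html");
const scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
console.log(scope); // "https://example.com/docs/"
// The old default would have been "https://example.com/"
```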