From b00c4262a7af9c88178a1c22a30dd9b0bd0e320f Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 21 Sep 2020 07:14:23 +0000
Subject: [PATCH] add --limit param for max URLs to be captured

add 'html check', only load HTML in browsers, load other content-types
directly via pywb, esp for PDFs (work on #8)
improved error handling
---
 Dockerfile   |  4 +--
 README.md    |  1 +
 index.js     | 87 ++++++++++++++++++++++++++++++++++++++++++++++------
 package.json |  2 ++
 run.sh       |  8 +++--
 yarn.lock    | 17 ++++++++++
 6 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2a9360d..af6955c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,18 +28,18 @@ RUN useradd zimit --shell /bin/bash --create-home \
 
 WORKDIR /app
 
-ADD index.js /app/
 ADD package.json /app/
 
 RUN chown -R zimit /app
 
-USER zimit
+#USER zimit
 
 RUN yarn install
 
 ADD config.yaml /app/
 ADD uwsgi.ini /app/
 ADD run.sh /app/
+ADD index.js /app/
 
 ENTRYPOINT ["/app/run.sh"]
 
diff --git a/README.md b/README.md
index 0823579..5a23eff 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ The image accepts the following parameters:
 - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
+- `--limit U` - Limit capture to at most U URLs
 
 The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.
 
diff --git a/index.js b/index.js
index a5a2103..d43eddd 100644
--- a/index.js
+++ b/index.js
@@ -1,6 +1,11 @@
 const puppeteer = require("puppeteer-core");
 const { Cluster } = require("puppeteer-cluster");
 const child_process = require("child_process");
+const fetch = require("node-fetch");
+const AbortController = require("abort-controller");
+
+const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+
 
 async function run(params) {
   // Chrome Flags, including proxy server
@@ -9,6 +14,9 @@ async function run(params) {
     `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`
   ];
 
+  // prefix for direct capture via pywb
+  const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
+
   // Puppeter Options
   const puppeteerOptions = {
     headless: true,
@@ -19,7 +27,7 @@ async function run(params) {
 
   // Puppeteer Cluster init and options
   const cluster = await Cluster.launch({
-    concurrency: Cluster.CONCURRENCY_PAGE,
+    concurrency: Cluster.CONCURRENCY_CONTEXT,
     maxConcurrency: Number(params.workers) || 1,
     skipDuplicateUrls: true,
     puppeteerOptions,
@@ -31,45 +39,68 @@ async function run(params) {
   let seenList = new Set();
   const url = params._[0];
 
-  let { waitUntil, timeout, scope } = params;
+  let { waitUntil, timeout, scope, limit } = params;
 
   // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
   waitUntil = waitUntil || "load";
 
   // Timeout per page
-  timeout = timeout || 60000;
+  timeout = Number(timeout) || 60000;
 
   // Scope for crawl, default to the domain of the URL
   scope = scope || new URL(url).origin + "/";
 
+  // Limit number of pages captured
+  limit = Number(limit) || 0;
+
+  console.log("Limit: " + limit);
+
+  // links crawled counter
+  let numLinks = 0;
+
   // Crawl Task
   cluster.task(async ({page, data}) => {
     const {url} = data;
 
+    if (!await htmlCheck(url, capturePrefix)) {
+      return;
+    }
+
     try {
       await page.goto(url, {waitUntil, timeout});
     } catch (e) {
       console.log(`Load timeout for ${url}`);
     }
 
-    try{
-      const result = await page.evaluate(() => {
+    let results = null;
+
+    try {
+      results = await page.evaluate(() => {
         return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href}))
       });
+    } catch (e) {
+      console.warn("Link Extraction failed", e);
+      return;
+    }
 
-      for (data of result) {
+    try {
+      for (data of results) {
         const newUrl = shouldCrawl(scope, seenList, data.url);
+
         if (newUrl) {
           seenList.add(newUrl);
+          if (numLinks++ >= limit && limit > 0) {
+            break;
+          }
           cluster.queue({url: newUrl});
         }
       }
     } catch (e) {
-      console.warn("error");
-      console.warn(e);
+      console.log("Queuing Error: " + e);
     }
   });
 
+  numLinks++;
   cluster.queue({url});
 
   await cluster.idle();
@@ -118,6 +149,43 @@ function shouldCrawl(scope, seenList, url) {
   return url;
 }
 
+async function htmlCheck(url, capturePrefix) {
+  try {
+    const resp = await fetch(url, {method: "HEAD"});
+
+    if (resp.status >= 400) {
+      console.log(`Skipping ${url}, invalid status ${resp.status}`);
+      return false;
+    }
+
+    const contentType = resp.headers.get("Content-Type");
+
+    // just load if no content-type
+    if (!contentType) {
+      return true;
+    }
+
+    const mime = contentType.split(";")[0];
+
+    if (HTML_TYPES.includes(mime)) {
+      return true;
+    }
+
+    // capture directly
+    console.log(`Direct capture: ${capturePrefix}${url}`);
+    const abort = new AbortController();
+    const signal = abort.signal;
+    const resp2 = await fetch(capturePrefix + url, {signal});
+    abort.abort();
+
+    return false;
+  } catch(e) {
+    console.log("HTML Check error", e);
+    // can't confirm not html, so try in browser
+    return true;
+  }
+}
+
 async function main() {
   const params = require('yargs').argv;
 
@@ -127,7 +195,8 @@ async function main() {
     await run(params);
     process.exit(0);
   } catch(e) {
-    console.log(e);
+    console.error("Crawl failed, ZIM creation skipped");
+    console.error(e);
     process.exit(1);
   }
 }
diff --git a/package.json b/package.json
index 93e01f5..c304660 100644
--- a/package.json
+++ b/package.json
@@ -6,6 +6,8 @@
   "author": "Ilya Kreymer ",
   "license": "MIT",
   "dependencies": {
+    "abort-controller": "^3.0.0",
+    "node-fetch": "^2.6.1",
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.0",
     "yargs": "^16.0.3"
diff --git a/run.sh b/run.sh
index b9bd9fc..74d0752 100755
--- a/run.sh
+++ b/run.sh
@@ -1,10 +1,12 @@
 #!/bin/bash
-URL="$1"
-
 wb-manager init capture
 uwsgi uwsgi.ini &> /dev/null &
 
 # needed for chrome
 export QT_X11_NO_MITSHM=1
 
-node index.js "$@"
+cmd="$@"
+
+su zimit -c "node index.js $cmd"
+
+
diff --git a/yarn.lock b/yarn.lock
index 039a6f7..d793a9a 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -19,6 +19,13 @@
   dependencies:
     "@types/node" "*"
 
+abort-controller@^3.0.0:
+  version "3.0.0"
+  resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392"
+  integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==
+  dependencies:
+    event-target-shim "^5.0.0"
+
 agent-base@5:
   version "5.1.1"
   resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c"
@@ -137,6 +144,11 @@ escalade@^3.0.2:
   resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e"
   integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig==
 
+event-target-shim@^5.0.0:
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
+  integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
+
 extract-zip@^2.0.0:
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a"
@@ -257,6 +269,11 @@ ms@2.1.2:
   resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
   integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
 
+node-fetch@^2.6.1:
+  version "2.6.1"
+  resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
+  integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
+
 once@^1.3.0, once@^1.3.1, once@^1.4.0:
   version "1.4.0"
   resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
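
Note: a usage sketch only, not part of this patch. The new --limit flag is
passed straight through run.sh to index.js, and the crawl URL remains the
first positional argument. The image tag (zimit), volume mount and cap-add
value below are illustrative assumptions based on the README example:

    # image tag, mount and capability flags are assumptions, adjust to your setup
    docker run -v /output:/output --cap-add=SYS_ADMIN --shm-size=1gb \
        zimit https://www.example.com/ --name example --limit 100

With a non-zero limit, the crawl task stops queueing new links once the
numLinks counter reaches the limit, so roughly 100 URLs end up in the ZIM.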
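
Note: a rough manual equivalent of the direct-capture path in htmlCheck(),
used for non-HTML content such as PDFs. The host and port here are
assumptions; the crawler builds the prefix from PROXY_HOST and PROXY_PORT
plus the "capture" collection that wb-manager creates in run.sh:

    # hypothetical pywb address; id_ serves the unrewritten (identity) response
    curl -s "http://localhost:8080/capture/record/id_/https://www.example.com/report.pdf" -o /dev/null

htmlCheck() itself only awaits the response and then aborts the request via
AbortController rather than downloading the body into the crawler process.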