add --limit param to set the maximum number of URLs to capture

add 'html check': load only HTML in the browser; capture other content types, especially PDFs, directly via pywb (work on #8)
improve error handling
Ilya Kreymer 2020-09-21 07:14:23 +00:00
parent ff2773677c
commit b00c4262a7
6 changed files with 105 additions and 14 deletions
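A rough sketch of the new flag in use follows, assuming arguments reach index.js via run.sh as in the diff below; the URL and numbers are placeholders, not values from this commit:

# illustrative sketch only; URL and numbers are placeholders
node index.js https://example.com/ --workers 2 --wait-until domcontentloaded --limit 100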

Dockerfile

@@ -28,18 +28,18 @@ RUN useradd zimit --shell /bin/bash --create-home \
WORKDIR /app
ADD index.js /app/
ADD package.json /app/
RUN chown -R zimit /app
USER zimit
#USER zimit
RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD run.sh /app/
ADD index.js /app/
ENTRYPOINT ["/app/run.sh"]

README.md

@@ -30,6 +30,7 @@ The image accepts the following parameters:
- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example).
- `--name` - Name of ZIM file (defaults to the hostname of the URL)
- `--output` - output directory (defaults to `/output`)
- `--limit U` - Limit capture to at most U URLs
The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.
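A rough sketch of such a command with the new `--limit` option; the image name, capability, paths, and numbers below are placeholders rather than values from this repository:

# illustrative sketch only: <image>, SYS_ADMIN, paths and numbers are assumptions
docker run -v /path/to/output:/output --cap-add=SYS_ADMIN --shm-size=1gb <image> https://example.com/ --name example --limit 100

The ZIM is written to the `--output` directory (default `/output`), so a volume mount like the one above exposes it on the host.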

index.js

@@ -1,6 +1,11 @@
const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
const fetch = require("node-fetch");
const AbortController = require("abort-controller");
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
async function run(params) {
// Chrome Flags, including proxy server
@@ -9,6 +14,9 @@ async function run(params) {
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`
];
// prefix for direct capture via pywb
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
// Puppeteer Options
const puppeteerOptions = {
headless: true,
@@ -19,7 +27,7 @@ async function run(params) {
// Puppeteer Cluster init and options
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: Number(params.workers) || 1,
skipDuplicateUrls: true,
puppeteerOptions,
@@ -31,45 +39,68 @@ async function run(params) {
let seenList = new Set();
const url = params._[0];
let { waitUntil, timeout, scope } = params;
let { waitUntil, timeout, scope, limit } = params;
// waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
waitUntil = waitUntil || "load";
// Timeout per page
timeout = timeout || 60000;
timeout = Number(timeout) || 60000;
// Scope for crawl, default to the domain of the URL
scope = scope || new URL(url).origin + "/";
// Limit number of pages captured
limit = Number(limit) || 0;
console.log("Limit: " + limit);
// links crawled counter
let numLinks = 0;
// Crawl Task
cluster.task(async ({page, data}) => {
const {url} = data;
if (!await htmlCheck(url, capturePrefix)) {
return;
}
try {
await page.goto(url, {waitUntil, timeout});
} catch (e) {
console.log(`Load timeout for ${url}`);
}
try{
const result = await page.evaluate(() => {
let results = null;
try {
results = await page.evaluate(() => {
return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href}))
});
} catch (e) {
console.warn("Link Extraction failed", e);
return;
}
for (data of result) {
try {
for (data of results) {
const newUrl = shouldCrawl(scope, seenList, data.url);
if (newUrl) {
seenList.add(newUrl);
if (numLinks++ >= limit && limit > 0) {
break;
}
cluster.queue({url: newUrl});
}
}
} catch (e) {
console.warn("error");
console.warn(e);
console.log("Queuing Error: " + e);
}
});
numLinks++;
cluster.queue({url});
await cluster.idle();
@@ -118,6 +149,43 @@ function shouldCrawl(scope, seenList, url) {
return url;
}
async function htmlCheck(url, capturePrefix) {
try {
const resp = await fetch(url, {method: "HEAD"});
if (resp.status >= 400) {
console.log(`Skipping ${url}, invalid status ${resp.status}`);
return false;
}
const contentType = resp.headers.get("Content-Type");
// just load if no content-type
if (!contentType) {
return true;
}
const mime = contentType.split(";")[0];
if (HTML_TYPES.includes(mime)) {
return true;
}
// capture directly
console.log(`Direct capture: ${capturePrefix}${url}`);
const abort = new AbortController();
const signal = abort.signal;
const resp2 = await fetch(capturePrefix + url, {signal});
abort.abort();
return false;
} catch(e) {
console.log("HTML Check error", e);
// can't confirm not html, so try in browser
return true;
}
}
async function main() {
const params = require('yargs').argv;
@@ -127,7 +195,8 @@ async function main() {
await run(params);
process.exit(0);
} catch(e) {
console.log(e);
console.error("Crawl failed, ZIM creation skipped");
console.error(e);
process.exit(1);
}
}

package.json

@@ -6,6 +6,8 @@
"author": "Ilya Kreymer <ikreymer@gmail.com>",
"license": "MIT",
"dependencies": {
"abort-controller": "^3.0.0",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.0",
"yargs": "^16.0.3"

run.sh

@@ -1,10 +1,12 @@
#!/bin/bash
URL="$1"
wb-manager init capture
uwsgi uwsgi.ini &> /dev/null &
# needed for chrome
export QT_X11_NO_MITSHM=1
node index.js "$@"
cmd="$@"
su zimit -c "node index.js $cmd"

yarn.lock

@@ -19,6 +19,13 @@
dependencies:
"@types/node" "*"
abort-controller@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392"
integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==
dependencies:
event-target-shim "^5.0.0"
agent-base@5:
version "5.1.1"
resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c"
@@ -137,6 +144,11 @@ escalade@^3.0.2:
resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e"
integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig==
event-target-shim@^5.0.0:
version "5.0.1"
resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
extract-zip@^2.0.0:
version "2.0.1"
resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a"
@@ -257,6 +269,11 @@ ms@2.1.2:
resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
node-fetch@^2.6.1:
version "2.6.1"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
once@^1.3.0, once@^1.3.1, once@^1.4.0:
version "1.4.0"
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"