Mirror of https://github.com/openzim/zimit.git
add --limit param for max URLs to be captured

add 'html check': only load HTML in browsers; load other content-types directly via pywb, especially PDFs (work on #8). Improved error handling.
parent ff2773677c
commit b00c4262a7
Dockerfile
@@ -28,18 +28,18 @@ RUN useradd zimit --shell /bin/bash --create-home \
 
 WORKDIR /app
 
-ADD index.js /app/
 ADD package.json /app/
 
 RUN chown -R zimit /app
 
-USER zimit
+#USER zimit
 
 RUN yarn install
 
 ADD config.yaml /app/
 ADD uwsgi.ini /app/
 ADD run.sh /app/
+ADD index.js /app/
 
 ENTRYPOINT ["/app/run.sh"]
 
README.md
@@ -30,6 +30,7 @@ The image accepts the following parameters:
 
 - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
+- `--limit U` - Limit capture to at most U URLs
 
 The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.
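Note: the new flag reaches index.js through yargs. A minimal sketch (not part of the diff) of how the value is normalized, matching the `Number(limit) || 0` logic added below, where 0 means unlimited:

    // node index.js https://example.com --limit 50
    const params = require('yargs').argv;
    let limit = Number(params.limit) || 0;  // missing or invalid => 0 => no cap
    console.log("Limit: " + limit);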
index.js (87 lines changed)
@@ -1,6 +1,11 @@
 const puppeteer = require("puppeteer-core");
 const { Cluster } = require("puppeteer-cluster");
 const child_process = require("child_process");
+const fetch = require("node-fetch");
+const AbortController = require("abort-controller");
+
+const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+
 
 async function run(params) {
   // Chrome Flags, including proxy server
@@ -9,6 +14,9 @@ async function run(params) {
     `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`
   ];
 
+  // prefix for direct capture via pywb
+  const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
+
   // Puppeteer Options
   const puppeteerOptions = {
     headless: true,
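Note: `capturePrefix` targets the recording endpoint of the `capture` collection that run.sh creates with `wb-manager init capture`; the `id_` modifier requests the identity (unrewritten) response. A hedged sketch of what a direct capture call looks like (illustrative URL, `directCapture` is a hypothetical name):

    // fetching through /capture/record/id_/<url> has pywb retrieve <url>
    // upstream and write the response into the collection's WARC
    const fetch = require("node-fetch");

    async function directCapture(capturePrefix, url) {
      await fetch(capturePrefix + url);  // e.g. url = "https://example.com/file.pdf"
    }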
@@ -19,7 +27,7 @@ async function run(params) {
 
   // Puppeteer Cluster init and options
   const cluster = await Cluster.launch({
-    concurrency: Cluster.CONCURRENCY_PAGE,
+    concurrency: Cluster.CONCURRENCY_CONTEXT,
     maxConcurrency: Number(params.workers) || 1,
     skipDuplicateUrls: true,
     puppeteerOptions,
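Note: in puppeteer-cluster, `CONCURRENCY_PAGE` runs every worker as a page inside one shared browser context, so jobs share cookies and storage; `CONCURRENCY_CONTEXT` gives each worker its own incognito context in the shared browser, isolating per-job state. A minimal sketch of the switched option (hypothetical wrapper, not in the diff):

    const { Cluster } = require("puppeteer-cluster");

    async function launchCluster(workers, puppeteerOptions) {
      return Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,  // one incognito context per worker
        maxConcurrency: workers,
        skipDuplicateUrls: true,
        puppeteerOptions,
      });
    }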
@@ -31,45 +39,68 @@ async function run(params) {
   let seenList = new Set();
   const url = params._[0];
 
-  let { waitUntil, timeout, scope } = params;
+  let { waitUntil, timeout, scope, limit } = params;
 
   // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
   waitUntil = waitUntil || "load";
 
   // Timeout per page
-  timeout = timeout || 60000;
+  timeout = Number(timeout) || 60000;
 
   // Scope for crawl, default to the domain of the URL
   scope = scope || new URL(url).origin + "/";
 
+  // Limit number of pages captured
+  limit = Number(limit) || 0;
+
+  console.log("Limit: " + limit);
+
+  // links crawled counter
+  let numLinks = 0;
+
   // Crawl Task
   cluster.task(async ({page, data}) => {
     const {url} = data;
 
+    if (!await htmlCheck(url, capturePrefix)) {
+      return;
+    }
+
     try {
       await page.goto(url, {waitUntil, timeout});
     } catch (e) {
       console.log(`Load timeout for ${url}`);
     }
 
-    try{
-      const result = await page.evaluate(() => {
+    let results = null;
+
+    try {
+      results = await page.evaluate(() => {
         return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href}))
       });
+    } catch (e) {
+      console.warn("Link Extraction failed", e);
+      return;
+    }
 
-      for (data of result) {
+    try {
+      for (data of results) {
         const newUrl = shouldCrawl(scope, seenList, data.url);
 
         if (newUrl) {
           seenList.add(newUrl);
+          if (numLinks++ >= limit && limit > 0) {
+            break;
+          }
           cluster.queue({url: newUrl});
         }
       }
     } catch (e) {
-      console.warn("error");
-      console.warn(e);
+      console.log("Queuing Error: " + e);
     }
   });
 
+  numLinks++;
   cluster.queue({url});
 
   await cluster.idle();
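Note on the cap's semantics: `numLinks++ >= limit` compares the pre-increment value, and the `limit > 0` guard keeps the default (`limit = 0`) unlimited; the seed URL itself is counted by the bare `numLinks++` before `cluster.queue({url})`. The same guard, factored into a hypothetical helper for clarity (not in the diff):

    // stop queueing new links once the cap is hit; limit === 0 means unlimited
    function underLimit(numLinks, limit) {
      return limit === 0 || numLinks < limit;
    }
    // equivalent use inside the task: if (!underLimit(numLinks++, limit)) break;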
@@ -118,6 +149,43 @@ function shouldCrawl(scope, seenList, url) {
   return url;
 }
 
+async function htmlCheck(url, capturePrefix) {
+  try {
+    const resp = await fetch(url, {method: "HEAD"});
+
+    if (resp.status >= 400) {
+      console.log(`Skipping ${url}, invalid status ${resp.status}`);
+      return false;
+    }
+
+    const contentType = resp.headers.get("Content-Type");
+
+    // just load if no content-type
+    if (!contentType) {
+      return true;
+    }
+
+    const mime = contentType.split(";")[0];
+
+    if (HTML_TYPES.includes(mime)) {
+      return true;
+    }
+
+    // capture directly
+    console.log(`Direct capture: ${capturePrefix}${url}`);
+    const abort = new AbortController();
+    const signal = abort.signal;
+    const resp2 = await fetch(capturePrefix + url, {signal});
+    abort.abort();
+
+    return false;
+  } catch(e) {
+    console.log("HTML Check error", e);
+    // can't confirm not html, so try in browser
+    return true;
+  }
+}
+
+
 async function main() {
   const params = require('yargs').argv;
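Note: the check boils down to a HEAD request that sniffs the Content-Type before committing a browser page to the URL; non-HTML responses are fetched once through `capturePrefix` so pywb records them, and `abort.abort()` after the response resolves presumably stops the Node client from downloading a body it does not need. The core sniff, reduced to a standalone sketch (node-fetch ^2 as pinned in package.json; `looksLikeHtml` is a hypothetical name):

    const fetch = require("node-fetch");
    const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];

    async function looksLikeHtml(url) {
      const resp = await fetch(url, { method: "HEAD" });
      // "text/html; charset=utf-8" -> "text/html"
      const contentType = resp.headers.get("Content-Type") || "";
      return HTML_TYPES.includes(contentType.split(";")[0]);
    }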
@@ -127,7 +195,8 @@ async function main() {
     await run(params);
     process.exit(0);
   } catch(e) {
-    console.log(e);
+    console.error("Crawl failed, ZIM creation skipped");
+    console.error(e);
     process.exit(1);
   }
 }
package.json
@@ -6,6 +6,8 @@
   "author": "Ilya Kreymer <ikreymer@gmail.com>",
   "license": "MIT",
   "dependencies": {
+    "abort-controller": "^3.0.0",
+    "node-fetch": "^2.6.1",
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.0",
     "yargs": "^16.0.3"
run.sh (8 lines changed)
@@ -1,10 +1,12 @@
 #!/bin/bash
 URL="$1"
 
 wb-manager init capture
 uwsgi uwsgi.ini &> /dev/null &
 
 # needed for chrome
 export QT_X11_NO_MITSHM=1
 
-node index.js "$@"
+cmd="$@"
+
+su zimit -c "node index.js $cmd"
yarn.lock (17 lines changed)
@@ -19,6 +19,13 @@
   dependencies:
     "@types/node" "*"
 
+abort-controller@^3.0.0:
+  version "3.0.0"
+  resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392"
+  integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==
+  dependencies:
+    event-target-shim "^5.0.0"
+
 agent-base@5:
   version "5.1.1"
   resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c"
@@ -137,6 +144,11 @@ escalade@^3.0.2:
   resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e"
   integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig==
 
+event-target-shim@^5.0.0:
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
+  integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
+
 extract-zip@^2.0.0:
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a"
@@ -257,6 +269,11 @@ ms@2.1.2:
   resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
   integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
 
+node-fetch@^2.6.1:
+  version "2.6.1"
+  resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
+  integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
+
 once@^1.3.0, once@^1.3.1, once@^1.4.0:
   version "1.4.0"
   resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"