diff --git a/README.md b/README.md
index 5a23eff..f42d6df 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ The image accepts the following parameters:
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
 - `--limit U` - Limit capture to at most U URLs
+- `--exclude R` - Skip crawling URLs that match the regex R. May be specified multiple times.
 
 The following is an example usage. The `--cap-add` and `--shm-size`
 flags are needed to run Chrome in Docker.
diff --git a/index.js b/index.js
index 812f6ae..9f9bbb4 100644
--- a/index.js
+++ b/index.js
@@ -39,7 +39,7 @@ async function run(params) {
   let seenList = new Set();
   const url = params._[0];
 
-  let { waitUntil, timeout, scope, limit } = params;
+  let { waitUntil, timeout, scope, limit, exclude } = params;
 
   // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
   waitUntil = waitUntil || "load";
@@ -53,6 +53,14 @@ async function run(params) {
   // Limit number of pages captured
   limit = Number(limit) || 0;
 
+  if (typeof exclude === "string") {
+    exclude = [new RegExp(exclude)];
+  } else if (exclude) {
+    exclude = exclude.map(e => new RegExp(e));
+  } else {
+    exclude = [];
+  }
+
   console.log("Limit: " + limit);
 
   // links crawled counter
@@ -85,7 +93,7 @@ async function run(params) {
 
     try {
       for (data of results) {
-        const newUrl = shouldCrawl(scope, seenList, data.url);
+        const newUrl = shouldCrawl(scope, seenList, data.url, exclude);
 
         if (newUrl) {
           seenList.add(newUrl);
@@ -119,7 +127,7 @@ async function run(params) {
 }
 
 
-function shouldCrawl(scope, seenList, url) {
+function shouldCrawl(scope, seenList, url, exclude) {
   try {
     url = new URL(url);
   } catch(e) {
@@ -146,6 +154,14 @@ function shouldCrawl(scope, seenList, url) {
     return false;
   }
 
+  // skip URLs matching any exclusion regex
+  for (const e of exclude) {
+    if (e.test(url)) {
+      //console.log(`Skipping ${url} excluded by ${e}`);
+      return false;
+    }
+  }
+
   return url;
 }
 
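
Notes on the `--exclude` handling, for reviewers:

A single `--exclude` flag arrives from the argument parser as a string, while repeating the flag yields an array of strings; the normalization in `run()` maps both cases to an array of `RegExp` objects, and falls back to an empty array when the flag is absent so that `shouldCrawl()` always receives something iterable. Below is a minimal standalone sketch of that logic, assuming a yargs-style parser; `buildExcludeList` is a hypothetical name used only for this sketch:

```js
// Mirror of the normalization in run(): always produce an array of RegExps.
// buildExcludeList is a hypothetical helper, not part of the patch.
function buildExcludeList(exclude) {
  if (typeof exclude === "string") {
    return [new RegExp(exclude)];           // single --exclude flag
  }
  if (Array.isArray(exclude)) {
    return exclude.map(e => new RegExp(e)); // repeated --exclude flags
  }
  return [];                                // flag absent: exclude nothing
}

console.log(buildExcludeList("\\.pdf$"));             // [ /\.pdf$/ ]
console.log(buildExcludeList(["/login", "\\.zip$"])); // [ /\/login/, /\.zip$/ ]
console.log(buildExcludeList(undefined));             // []
```

Patterns are plain JavaScript regexes, so a crawl could pass, for example, `--exclude '\.pdf$' --exclude '/search\?'` to skip PDF links and search-result pages.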
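The check added to `shouldCrawl()` rejects a URL as soon as any exclusion regex matches it; non-matching URLs fall through to the existing dedup and scope logic unchanged. The test reduces to the following sketch, where `isExcluded` is a hypothetical helper named only for illustration:

```js
// First matching exclusion regex wins; an empty list excludes nothing.
// isExcluded is a hypothetical stand-in for the loop inside shouldCrawl().
function isExcluded(url, exclude) {
  return exclude.some(e => e.test(url));
}

const exclude = [new RegExp("\\.pdf$"), new RegExp("/search\\?")];
console.log(isExcluded("https://example.com/report.pdf", exclude));    // true
console.log(isExcluded("https://example.com/search?q=wiki", exclude)); // true
console.log(isExcluded("https://example.com/about", exclude));         // false
```

One behavioral detail visible in the diff: an excluded URL returns `false` before `run()` ever adds it to `seenList`, so the same excluded URL is re-tested (and re-rejected) each time it is encountered rather than being remembered as seen.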