add regex exclusions

Ilya Kreymer 2020-09-22 17:48:09 +00:00
parent f252245983
commit f25b390f15
2 changed files with 20 additions and 3 deletions


@@ -31,6 +31,7 @@ The image accepts the following parameters:
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
 - `--limit U` - Limit capture to at most U URLs
+- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times.

 The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.
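For context, an invocation combining the new flag with the parameters listed above might look like the sketch below. The image name, the volume mount, and the `--cap-add`/`--shm-size` values are placeholders rather than values taken from this diff; the URL is positional, matching `params._[0]` in the code below, and the regex patterns are only illustrative.

    docker run -v $PWD/output:/output --cap-add=SYS_ADMIN --shm-size=1g <image-name> \
        https://example.com/ --name example --limit 100 \
        --exclude "\.pdf$" --exclude "/login"

Because `--exclude` can be repeated, each occurrence contributes one pattern; a URL matching any of them is skipped.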


@@ -39,7 +39,7 @@ async function run(params) {
   let seenList = new Set();
   const url = params._[0];

-  let { waitUntil, timeout, scope, limit } = params;
+  let { waitUntil, timeout, scope, limit, exclude } = params;

   // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
   waitUntil = waitUntil || "load";
@@ -53,6 +53,14 @@ async function run(params) {
   // Limit number of pages captured
   limit = Number(limit) || 0;

+  if (exclude) {
+    if (typeof(exclude) === "string") {
+      exclude = [new RegExp(exclude)];
+    } else {
+      exclude = exclude.map(e => new RegExp(e));
+    }
+  }
+
   console.log("Limit: " + limit);

   // links crawled counter
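Typical argv parsers hand `exclude` through as a plain string when the flag is given once and as an array of strings when it is repeated, which is what the `typeof` branch above accounts for. The following is a minimal standalone sketch of that normalization; the `normalizeExclude` name is made up here, and unlike the hunk above it also falls back to an empty array when the flag is absent.

    // Sketch: turn the raw --exclude value(s) into an array of RegExp.
    // Assumes the argv parser yields a string for one occurrence and an
    // array of strings for repeated occurrences.
    function normalizeExclude(exclude) {
      if (!exclude) {
        return [];                                // no exclusions given
      }
      if (typeof exclude === "string") {
        return [new RegExp(exclude)];             // single --exclude value
      }
      return exclude.map(e => new RegExp(e));     // repeated --exclude values
    }

    console.log(normalizeExclude("\\.pdf$"));               // [ /\.pdf$/ ]
    console.log(normalizeExclude(["\\.pdf$", "/login"]));   // [ /\.pdf$/, /\/login/ ]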
@@ -85,7 +93,7 @@ async function run(params) {
     try {
       for (data of results) {
-        const newUrl = shouldCrawl(scope, seenList, data.url);
+        const newUrl = shouldCrawl(scope, seenList, data.url, exclude);

         if (newUrl) {
           seenList.add(newUrl);
@@ -119,7 +127,7 @@
 }

-function shouldCrawl(scope, seenList, url) {
+function shouldCrawl(scope, seenList, url, exclude) {
   try {
     url = new URL(url);
   } catch(e) {
@@ -146,6 +154,14 @@ function shouldCrawl(scope, seenList, url) {
     return false;
   }

+  // check exclusions
+  for (const e of exclude) {
+    if (e.exec(url)) {
+      //console.log(`Skipping ${url} excluded by ${e}`);
+      return false;
+    }
+  }
+
   return url;
 }
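Below is a standalone sketch of just the exclusion test added to `shouldCrawl`, assuming `exclude` is always an array of `RegExp` objects prepared as in `run()` above (the `isExcluded` helper name is invented for illustration). `RegExp.prototype.exec` stringifies its argument, so passing the parsed `URL` object tests each pattern against `url.toString()`; the patterns match anywhere in the URL string unless they are anchored.

    // Sketch: return true if the URL matches any exclusion pattern.
    function isExcluded(url, exclude) {
      for (const e of exclude) {
        // exec() coerces its argument to a string, so a URL object is
        // matched against its full serialized form
        if (e.exec(url)) {
          return true;
        }
      }
      return false;
    }

    const exclude = [new RegExp("\\.pdf$"), new RegExp("/login")];
    console.log(isExcluded(new URL("https://example.com/report.pdf"), exclude)); // true
    console.log(isExcluded(new URL("https://example.com/about"), exclude));      // false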