mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
add regex exclusions
This commit is contained in:
parent
f252245983
commit
f25b390f15
@ -31,6 +31,7 @@ The image accepts the following parameters:
|
||||
- `--name` - Name of ZIM file (defaults to the hostname of the URL)
|
||||
- `--output` - output directory (defaults to `/output`)
|
||||
- `--limit U` - Limit capture to at most U URLs
|
||||
- `--exclude <regex>` - Exclude URLs that match the regex from the crawl. Can be specified multiple times.
|
||||
|
||||
The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.
|
||||
|
||||
|
22
index.js
22
index.js
@ -39,7 +39,7 @@ async function run(params) {
|
||||
let seenList = new Set();
|
||||
const url = params._[0];
|
||||
|
||||
let { waitUntil, timeout, scope, limit } = params;
|
||||
let { waitUntil, timeout, scope, limit, exclude } = params;
|
||||
|
||||
// waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
|
||||
waitUntil = waitUntil || "load";
|
||||
@ -53,6 +53,14 @@ async function run(params) {
|
||||
// Limit number of pages captured
|
||||
limit = Number(limit) || 0;
|
||||
|
||||
if (exclude) {
|
||||
if (typeof(exclude) === "string") {
|
||||
exclude = [new RegExp(exclude)];
|
||||
} else {
|
||||
exclude = exclude.map(e => new RegExp(e));
|
||||
}
|
||||
}
|
||||
|
||||
console.log("Limit: " + limit);
|
||||
|
||||
// links crawled counter
|
||||
@ -85,7 +93,7 @@ async function run(params) {
|
||||
|
||||
try {
|
||||
for (data of results) {
|
||||
const newUrl = shouldCrawl(scope, seenList, data.url);
|
||||
const newUrl = shouldCrawl(scope, seenList, data.url, exclude);
|
||||
|
||||
if (newUrl) {
|
||||
seenList.add(newUrl);
|
||||
@ -119,7 +127,7 @@ async function run(params) {
|
||||
}
|
||||
|
||||
|
||||
function shouldCrawl(scope, seenList, url) {
|
||||
function shouldCrawl(scope, seenList, url, exclude) {
|
||||
try {
|
||||
url = new URL(url);
|
||||
} catch(e) {
|
||||
@ -146,6 +154,14 @@ function shouldCrawl(scope, seenList, url) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// check exclusions
|
||||
for (const e of exclude) {
|
||||
if (e.exec(url)) {
|
||||
//console.log(`Skipping ${url} excluded by ${e}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user