mirror of
https://github.com/openzim/zimit.git
synced 2025-09-23 20:18:04 -04:00
crawling: move checking logic to shouldCrawl, remove hashtag before checking seen list
This commit is contained in:
parent
9b23de828b
commit
ff2773677c
56
index.js
56
index.js
@ -3,12 +3,13 @@ const { Cluster } = require("puppeteer-cluster");
|
||||
const child_process = require("child_process");
|
||||
|
||||
async function run(params) {
|
||||
// Chrome Flags, including proxy server
|
||||
const args = [
|
||||
"--no-first-run",
|
||||
"--no-xshm",
|
||||
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
||||
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`
|
||||
];
|
||||
|
||||
// Puppeter Options
|
||||
const puppeteerOptions = {
|
||||
headless: true,
|
||||
executablePath: "/usr/bin/google-chrome",
|
||||
@ -16,6 +17,7 @@ async function run(params) {
|
||||
args
|
||||
};
|
||||
|
||||
// Puppeteer Cluster init and options
|
||||
const cluster = await Cluster.launch({
|
||||
concurrency: Cluster.CONCURRENCY_PAGE,
|
||||
maxConcurrency: Number(params.workers) || 1,
|
||||
@ -25,14 +27,22 @@ async function run(params) {
|
||||
monitor: true
|
||||
});
|
||||
|
||||
// Maintain own seen list
|
||||
let seenList = new Set();
|
||||
const url = params._[0];
|
||||
|
||||
let { waitUntil, timeout, scope } = params;
|
||||
|
||||
// waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
|
||||
waitUntil = waitUntil || "load";
|
||||
|
||||
// Timeout per page
|
||||
timeout = timeout || 60000;
|
||||
|
||||
// Scope for crawl, default to the domain of the URL
|
||||
scope = scope || new URL(url).origin + "/";
|
||||
|
||||
// Crawl Task
|
||||
cluster.task(async ({page, data}) => {
|
||||
const {url} = data;
|
||||
|
||||
@ -48,13 +58,10 @@ async function run(params) {
|
||||
});
|
||||
|
||||
for (data of result) {
|
||||
if (seenList.has(data.url)) {
|
||||
continue;
|
||||
}
|
||||
//console.log(`check ${data.url} in ${allowedDomain}`);
|
||||
if (scope && data.url.startsWith(scope)) {
|
||||
seenList.add(data.url);
|
||||
cluster.queue({url: data.url});
|
||||
const newUrl = shouldCrawl(scope, seenList, data.url);
|
||||
if (newUrl) {
|
||||
seenList.add(newUrl);
|
||||
cluster.queue({url: newUrl});
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
@ -81,6 +88,37 @@ async function run(params) {
|
||||
}
|
||||
|
||||
|
||||
function shouldCrawl(scope, seenList, url) {
|
||||
try {
|
||||
url = new URL(url);
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// remove hashtag
|
||||
url.hash = "";
|
||||
|
||||
// only queue http/https URLs
|
||||
if (url.protocol != "http:" && url.protocol != "https:") {
|
||||
return false;
|
||||
}
|
||||
|
||||
url = url.href;
|
||||
|
||||
// skip already crawled
|
||||
if (seenList.has(url)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if scope is provided, skip urls not in scope
|
||||
if (scope && !url.startsWith(scope)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
|
||||
async function main() {
|
||||
const params = require('yargs').argv;
|
||||
console.log(params);
|
||||
|
Loading…
x
Reference in New Issue
Block a user