crawling: move checking logic to shouldCrawl, remove hashtag before checking seen list

2025-09-24 04:30:11 -04:00 · 2020-09-19 23:18:15 +00:00 · 2020-09-19 23:18:15 +00:00 · ff2773677c
commit ff2773677c
parent 9b23de828b
1 changed files with 47 additions and 9 deletions
--- a/index.js
+++ b/index.js
@ -3,12 +3,13 @@ const { Cluster } = require("puppeteer-cluster");
 const child_process = require("child_process");

 async function run(params) {
+  // Chrome Flags, including proxy server
  const args = [
-    "--no-first-run",
-    "--no-xshm",
+    "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`
  ];

+  // Puppeter Options
  const puppeteerOptions = {
    headless: true,
    executablePath: "/usr/bin/google-chrome",
@ -16,6 +17,7 @@ async function run(params) {
    args
  };

+  // Puppeteer Cluster init and options
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: Number(params.workers) || 1,
@ -25,14 +27,22 @@ async function run(params) {
    monitor: true
  });

+  // Maintain own seen list
  let seenList = new Set();
  const url = params._[0];

  let { waitUntil, timeout, scope } = params;
+
+  // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
  waitUntil = waitUntil || "load";
+
+  // Timeout per page
  timeout = timeout || 60000;
+
+  // Scope for crawl, default to the domain of the URL
  scope = scope || new URL(url).origin + "/";

+  // Crawl Task
  cluster.task(async ({page, data}) => {
    const {url} = data;

@ -48,13 +58,10 @@ async function run(params) {
      });

      for (data of result) {
-        if (seenList.has(data.url)) {
-          continue;
-        }
-        //console.log(`check ${data.url} in ${allowedDomain}`);
-        if (scope && data.url.startsWith(scope)) {
-          seenList.add(data.url);
-          cluster.queue({url: data.url});
+        const newUrl = shouldCrawl(scope, seenList, data.url);
+        if (newUrl) {
+          seenList.add(newUrl);
+          cluster.queue({url: newUrl});
        }
      }
    } catch (e) {
@ -81,6 +88,37 @@ async function run(params) {
 }


+function shouldCrawl(scope, seenList, url) {
+  try {
+    url = new URL(url);
+  } catch(e) {
+    return false;
+  }
+
+  // remove hashtag
+  url.hash = "";
+
+  // only queue http/https URLs
+  if (url.protocol != "http:" && url.protocol != "https:") {
+    return false;
+  }
+
+  url = url.href;
+
+  // skip already crawled
+  if (seenList.has(url)) {
+    return false;
+  }
+
+  // if scope is provided, skip urls not in scope
+  if (scope && !url.startsWith(scope)) {
+    return false;
+  }
+
+  return url;
+}
+
+
 async function main() {
  const params = require('yargs').argv;
  console.log(params);