From b00c4262a7af9c88178a1c22a30dd9b0bd0e320f Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 21 Sep 2020 07:14:23 +0000
Subject: [PATCH] add --limit param for max URLs to be captured

add 'html check', only load HTML in browsers, load other content-types
directly via pywb, esp for PDFs (work on #8)
improved error handling
---
 Dockerfile   |  4 +--
 README.md    |  1 +
 index.js     | 87 ++++++++++++++++++++++++++++++++++++++++++++++------
 package.json |  2 ++
 run.sh       |  8 +++--
 yarn.lock    | 17 ++++++++++
 6 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2a9360d..af6955c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,18 +28,18 @@ RUN useradd zimit --shell /bin/bash --create-home \
 
 WORKDIR /app
 
-ADD index.js /app/
 ADD package.json /app/
 
 RUN chown -R zimit /app
 
-USER zimit
+#USER zimit
 
 RUN yarn install
 
 ADD config.yaml /app/
 ADD uwsgi.ini /app/
 ADD run.sh /app/
+ADD index.js /app/
 
 ENTRYPOINT ["/app/run.sh"]
 
diff --git a/README.md b/README.md
index 0823579..5a23eff 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ The image accepts the following parameters:
 - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
 - `--name` - Name of ZIM file (defaults to the hostname of the URL)
 - `--output` - output directory (defaults to `/output`)
+- `--limit U` - Limit capture to at most U URLs
 
 The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker.
 
diff --git a/index.js b/index.js
index a5a2103..d43eddd 100644
--- a/index.js
+++ b/index.js
@@ -1,6 +1,11 @@
 const puppeteer = require("puppeteer-core");
 const { Cluster } = require("puppeteer-cluster");
 const child_process = require("child_process");
+const fetch = require("node-fetch");
+const AbortController = require("abort-controller");
+
+const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
+
 
 async function run(params) {
   // Chrome Flags, including proxy server
@@ -9,6 +14,9 @@ async function run(params) {
     `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`
   ];
 
+  // prefix for direct capture via pywb
+  const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
+
   // Puppeter Options
   const puppeteerOptions = {
     headless: true,
@@ -19,7 +27,7 @@ async function run(params) {
 
   // Puppeteer Cluster init and options
   const cluster = await Cluster.launch({
-    concurrency: Cluster.CONCURRENCY_PAGE,
+    concurrency: Cluster.CONCURRENCY_CONTEXT,
     maxConcurrency: Number(params.workers) || 1,
     skipDuplicateUrls: true,
     puppeteerOptions,
@@ -31,45 +39,68 @@ async function run(params) {
   let seenList = new Set();
   const url = params._[0];
 
-  let { waitUntil, timeout, scope } = params;
+  let { waitUntil, timeout, scope, limit } = params;
 
   // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
   waitUntil = waitUntil || "load";
 
   // Timeout per page
-  timeout = timeout || 60000;
+  timeout = Number(timeout) || 60000;
 
   // Scope for crawl, default to the domain of the URL
   scope = scope || new URL(url).origin + "/";
 
+  // Limit number of pages captured
+  limit = Number(limit) || 0;
+
+  console.log("Limit: " + limit);
+
+  // links crawled counter
+  let numLinks = 0;
+
   // Crawl Task
   cluster.task(async ({page, data}) => {
     const {url} = data;
 
+    if (!await htmlCheck(url, capturePrefix)) {
+      return;
+    }
+
     try {
       await page.goto(url, {waitUntil, timeout});
     } catch (e) {
       console.log(`Load timeout for ${url}`);
     }
 
-    try{
-      const result = await page.evaluate(() => {
+    let results = null;
+
+    try {
+      results = await page.evaluate(() => {
         return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href}))
       });
+    } catch (e) {
+      console.warn("Link Extraction failed", e);
+      return;
+    }
 
-      for (data of result) {
+    try {
+      for (data of results) {
         const newUrl = shouldCrawl(scope, seenList, data.url);
+
         if (newUrl) {
           seenList.add(newUrl);
+          if (numLinks++ >= limit && limit > 0) {
+            break;
+          }
           cluster.queue({url: newUrl});
         }
       }
     } catch (e) {
-      console.warn("error");
-      console.warn(e);
+      console.log("Queuing Error: " + e);
     }
   });
 
+  numLinks++;
   cluster.queue({url});
 
   await cluster.idle();
@@ -118,6 +149,43 @@ function shouldCrawl(scope, seenList, url) {
   return url;
 }
 
+async function htmlCheck(url, capturePrefix) {
+  try {
+    const resp = await fetch(url, {method: "HEAD"});
+
+    if (resp.status >= 400) {
+      console.log(`Skipping ${url}, invalid status ${resp.status}`);
+      return false;
+    }
+
+    const contentType = resp.headers.get("Content-Type");
+
+    // just load if no content-type
+    if (!contentType) {
+      return true;
+    }
+
+    const mime = contentType.split(";")[0];
+
+    if (HTML_TYPES.includes(mime)) {
+      return true;
+    }
+
+    // capture directly
+    console.log(`Direct capture: ${capturePrefix}${url}`);
+    const abort = new AbortController();
+    const signal = abort.signal;
+    const resp2 = await fetch(capturePrefix + url, {signal});
+    abort.abort();
+
+    return false;
+  } catch(e) {
+    console.log("HTML Check error", e);
+    // can't confirm not html, so try in browser
+    return true;
+  }
+}
+
 async function main() {
   const params = require('yargs').argv;
 
@@ -127,7 +195,8 @@ async function main() {
     await run(params);
     process.exit(0);
   } catch(e) {
-    console.log(e);
+    console.error("Crawl failed, ZIM creation skipped");
+    console.error(e);
     process.exit(1);
   }
 }
diff --git a/package.json b/package.json
index 93e01f5..c304660 100644
--- a/package.json
+++ b/package.json
@@ -6,6 +6,8 @@
   "author": "Ilya Kreymer ",
   "license": "MIT",
   "dependencies": {
+    "abort-controller": "^3.0.0",
+    "node-fetch": "^2.6.1",
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.0",
     "yargs": "^16.0.3"
diff --git a/run.sh b/run.sh
index b9bd9fc..74d0752 100755
--- a/run.sh
+++ b/run.sh
@@ -1,10 +1,12 @@
 #!/bin/bash
-URL="$1"
-
 wb-manager init capture
 uwsgi uwsgi.ini &> /dev/null &
 
 # needed for chrome
 export QT_X11_NO_MITSHM=1
 
-node index.js "$@"
+cmd="$@"
+
+su zimit -c "node index.js $cmd"
+
+
diff --git a/yarn.lock b/yarn.lock
index 039a6f7..d793a9a 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -19,6 +19,13 @@
   dependencies:
     "@types/node" "*"
 
+abort-controller@^3.0.0:
+  version "3.0.0"
+  resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392"
+  integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==
+  dependencies:
+    event-target-shim "^5.0.0"
+
 agent-base@5:
   version "5.1.1"
   resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c"
@@ -137,6 +144,11 @@ escalade@^3.0.2:
   resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e"
   integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig==
 
+event-target-shim@^5.0.0:
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
+  integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
+
 extract-zip@^2.0.0:
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a"
@@ -257,6 +269,11 @@ ms@2.1.2:
   resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
   integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
 
+node-fetch@^2.6.1:
+  version "2.6.1"
+  resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
+  integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
+
 once@^1.3.0, once@^1.3.1, once@^1.4.0:
   version "1.4.0"
   resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
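
Note: a usage sketch only, not part of this patch. The new --limit flag is
passed straight through run.sh to index.js, and the crawl URL remains the
first positional argument. The image tag (zimit), volume mount and cap-add
value below are illustrative assumptions based on the README example:

    # image tag, mount and capability flags are assumptions, adjust to your setup
    docker run -v /output:/output --cap-add=SYS_ADMIN --shm-size=1gb \
        zimit https://www.example.com/ --name example --limit 100

With a non-zero limit, the crawl task stops queueing new links once the
numLinks counter reaches the limit, so roughly 100 URLs end up in the ZIM.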
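
Note: a rough manual equivalent of the direct-capture path in htmlCheck(),
used for non-HTML content such as PDFs. The host and port here are
assumptions; the crawler builds the prefix from PROXY_HOST and PROXY_PORT
plus the "capture" collection that wb-manager creates in run.sh:

    # hypothetical pywb address; id_ serves the unrewritten (identity) response
    curl -s "http://localhost:8080/capture/record/id_/https://www.example.com/report.pdf" -o /dev/null

htmlCheck() itself only awaits the response and then aborts the request via
AbortController rather than downloading the body into the crawler process.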