diff --git a/Dockerfile b/Dockerfile
index faf9bea..013a42c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,12 +15,14 @@ ENV PROXY_HOST=localhost \
 
 RUN pip install gevent>=20.9.0 uwsgi
 
-RUN pip install warc2zim==1.2.0
+#RUN pip install git+https://github.com/openzim/warc2zim@fuzzy-match
+RUN pip install 'warc2zim>=1.3.0'
 
 RUN pip install git+https://github.com/webrecorder/pywb@patch-work
 
 COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
 COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
+COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
 
 WORKDIR /app
 
@@ -32,6 +34,8 @@ ADD config.yaml /app/
 ADD uwsgi.ini /app/
 ADD zimit.py /app/
 ADD crawler.js /app/
+ADD autoplay.js /app/
+
 
 RUN ln -s /app/zimit.py /usr/bin/zimit
 CMD ["zimit"]
diff --git a/autoplay.js b/autoplay.js
new file mode 100644
index 0000000..d36b958
--- /dev/null
+++ b/autoplay.js
@@ -0,0 +1,89 @@
+(() => {
+
+  function run() {
+    if (window.navigator.__crawler_autoplay) {
+      return;
+    }
+
+    //console.log("checking autoplay for " + document.location.href);
+    window.navigator.__crawler_autoplay = true;
+
+    const specialActions = [
+      {
+        rx: /w\.soundcloud\.com/,
+        check(url) {
+          const autoplay = url.searchParams.get('auto_play');
+          return autoplay === 'true';
+        },
+        handle(url) {
+          url.searchParams.set('auto_play', 'true');
+          // set continuous_play to true in order to handle
+          // a playlist etc
+          url.searchParams.set('continuous_play', 'true');
+          self.location.href = url.href;
+        },
+      },
+      {
+        rx: [/player\.vimeo\.com/, /youtube\.com\/embed\//],
+        check(url) {
+          const autoplay = url.searchParams.get('autoplay');
+          return autoplay === '1';
+        },
+        handle(url) {
+          url.searchParams.set('autoplay', '1');
+          if (window.__crawler_autoplayLoad) {
+            window.__crawler_autoplayLoad(url.href);
+          }
+          self.location.href = url.href;
+        },
+      },
+    ];
+    const url = new URL(self.location.href);
+    for (let i = 0; i < specialActions.length; i++) {
+      if (Array.isArray(specialActions[i].rx)) {
+        const rxs = specialActions[i].rx;
+        for (let j = 0; j < rxs.length; j++) {
+          if (url.href.search(rxs[j]) >= 0) {
+            if (specialActions[i].check(url)) return;
+            return specialActions[i].handle(url);
+          }
+        }
+      } else if (url.href.search(specialActions[i].rx) >= 0) {
+        if (specialActions[i].check(url)) return;
+        return specialActions[i].handle(url);
+      }
+    }
+  }
+
+  document.addEventListener("readystatechange", run);
+
+  if (document.readyState === "complete") {
+    run();
+  }
+
+
+  const mediaSet = new Set();
+
+  setInterval(() => {
+    const medias = document.querySelectorAll("video, audio");
+
+    for (const media of medias) {
+      try {
+        if (media.src && !mediaSet.has(media.src)) {
+          if (window.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
+            window.__crawler_queueUrls(media.src);
+          }
+          mediaSet.add(media.src);
+        } else if (!media.src) {
+          media.play();
+        }
+      } catch(e) {
+        console.log(e);
+      }
+    }
+  }, 3000);
+
+
+
+})();
+
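autoplay.js above rewrites known embed URLs (SoundCloud, Vimeo, YouTube) so the player starts without a user gesture, then reloads the frame. A minimal sketch of that rewrite, runnable in plain Node; the video URL is a placeholder, not from the patch:

```js
// Sketch of the specialActions rewrite above (illustrative only; the video URL
// is a placeholder). If the embed URL does not already request autoplay, the
// parameter is added before the frame is reloaded with the new href.
const url = new URL("https://player.vimeo.com/video/123456789");

if (url.searchParams.get("autoplay") !== "1") {
  url.searchParams.set("autoplay", "1");
  // in autoplay.js this is followed by: self.location.href = url.href;
}

console.log(url.href); // https://player.vimeo.com/video/123456789?autoplay=1
```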
"networkidle0", "networkidle2"]; +const NEW_CONTEXT_OPTS = ["page", "session", "browser"]; const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"; // to ignore HTTPS error for HEAD check @@ -24,30 +26,58 @@ process.once('SIGTERM', (code) => { }); +const autoplayScript = fs.readFileSync("./autoplay.js", "utf-8"); + + +// prefix for direct capture via pywb +const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`; +const headers = {"User-Agent": CHROME_USER_AGENT}; + + async function run(params) { // Chrome Flags, including proxy server const args = [ "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically) `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`, - "--no-sandbox" + "--no-sandbox", + "--disable-background-media-suspend", + "--autoplay-policy=no-user-gesture-required", ]; - // prefix for direct capture via pywb - const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`; - // Puppeter Options const puppeteerOptions = { headless: true, - //executablePath: "/usr/bin/google-chrome", + executablePath: "/opt/google/chrome/google-chrome", ignoreHTTPSErrors: true, args }; + // params + const { url, waitUntil, timeout, scope, limit, exclude, scroll, newContext } = params; + + let concurrency = Cluster.CONCURRENCY_PAGE; + + switch (newContext) { + case "page": + concurrency = Cluster.CONCURRENCY_PAGE; + break; + + case "session": + concurrency = Cluster.CONCURRENCY_CONTEXT; + break; + + case "browser": + concurrency = Cluster.CONCURRENCY_BROWSER; + break; + } + // Puppeteer Cluster init and options const cluster = await Cluster.launch({ - concurrency: Cluster.CONCURRENCY_PAGE, + concurrency, maxConcurrency: Number(params.workers) || 1, skipDuplicateUrls: true, + // total timeout for cluster + timeout: timeout * 2, puppeteerOptions, puppeteer, monitor: true @@ -56,9 +86,6 @@ async function run(params) { // Maintain own seen list const seenList = new Set(); - // params - const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params; - //console.log("Limit: " + limit); // links crawled counter @@ -72,12 +99,46 @@ async function run(params) { return; } + //page.on('console', message => console.log(`${message.type()} ${message.text()}`)); + //page.on('pageerror', message => console.warn(message)); + //page.on('error', message => console.warn(message)); + //page.on('requestfailed', message => console.warn(message._failureText)); + const mediaResults = []; + + await page.exposeFunction('__crawler_queueUrls', (url) => { + mediaResults.push(directCapture(url)); + }); + + let waitForVideo = false; + + await page.exposeFunction('__crawler_autoplayLoad', (url) => { + console.log("*** Loading autoplay URL: " + url); + waitForVideo = true; + }); + + try { + await page.evaluateOnNewDocument(autoplayScript); + } catch(e) { + console.log(e); + } + try { await page.goto(url, {waitUntil, timeout}); } catch (e) { console.log(`Load timeout for ${url}`); } + try { + await Promise.all(mediaResults); + } catch (e) { + console.log(`Error loading media URLs`, e); + } + + if (waitForVideo) { + console.log("Extra wait 15s for video loading"); + await sleep(15000); + } + if (scroll) { try { await Promise.race([page.evaluate(autoScroll), sleep(30000)]); @@ -148,8 +209,18 @@ function shouldCrawl(scope, seenList, url, exclude) { return false; } - // if scope is provided, skip urls not in scope - if 
(scope && !url.startsWith(scope)) { + let inScope = false; + + // check scopes + for (const s of scope) { + if (s.exec(url)) { + inScope = true; + break; + } + } + + if (!inScope) { + //console.log(`Not in scope ${url} ${scope}`); return false; } @@ -166,8 +237,6 @@ function shouldCrawl(scope, seenList, url, exclude) { async function htmlCheck(url, capturePrefix) { try { - const headers = {"User-Agent": CHROME_USER_AGENT}; - const agent = url.startsWith("https:") ? HTTPS_AGENT : null; const resp = await fetch(url, {method: "HEAD", headers, agent}); @@ -191,11 +260,7 @@ async function htmlCheck(url, capturePrefix) { } // capture directly - console.log(`Direct capture: ${capturePrefix}${url}`); - const abort = new AbortController(); - const signal = abort.signal; - const resp2 = await fetch(capturePrefix + url, {signal, headers}); - abort.abort(); + await directCapture(url); return false; } catch(e) { @@ -205,6 +270,15 @@ async function htmlCheck(url, capturePrefix) { } } +async function directCapture(url) { + console.log(`Direct capture: ${capturePrefix}${url}`); + const abort = new AbortController(); + const signal = abort.signal; + const resp2 = await fetch(capturePrefix + url, {signal, headers}); + abort.abort(); +} + + async function autoScroll() { const canScrollMore = () => @@ -249,6 +323,12 @@ async function main() { type: "number", }, + "newContext": { + describe: "The context for each new capture, can be a new: page, session or browser.", + default: "page", + type: "string" + }, + "waitUntil": { describe: "Puppeteer page.goto() condition to wait for before continuing", default: "load", @@ -267,11 +347,11 @@ async function main() { }, "scope": { - describe: "The scope of current page that should be included in the crawl (defaults to the immediate directory of URL)", + describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)", }, "exclude": { - describe: "Regex of URLs that should be excluded from the crawl." + describe: "Regex of page URLs that should be excluded from the crawl." 
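shouldCrawl() above now treats scope as an array of regular expressions instead of a single prefix string. A small sketch of the resulting check; the sample URLs and patterns are placeholders, and `.some()` stands in for the explicit loop in the patch:

```js
// Sketch of the new scope/exclude check: a URL is crawled only if at least one
// scope regex matches it and no exclude regex does.
function inScope(url, scope, exclude) {
  if (!scope.some((rx) => rx.exec(url))) {
    return false;
  }
  return !exclude.some((rx) => rx.exec(url));
}

const scope = [new RegExp("^https://example\\.com/docs/")];
const exclude = [new RegExp("\\?print=1")];

console.log(inScope("https://example.com/docs/page.html", scope, exclude)); // true
console.log(inScope("https://example.com/blog/post.html", scope, exclude)); // false
```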
@@ -291,7 +371,8 @@
      argv.url = url.href;
 
      if (!argv.scope) {
-        argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
+        //argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
+        argv.scope = [new RegExp("^" + rxEscape(url.href.slice(0, url.href.lastIndexOf("/") + 1)))];
      }
 
      argv.timeout *= 1000;
@@ -302,6 +383,11 @@
        throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }
 
+      if (!NEW_CONTEXT_OPTS.includes(argv.newContext)) {
+        throw new Error("Invalid newContext, must be one of: " + NEW_CONTEXT_OPTS.join(","));
+      }
+
+      // Support one or multiple exclude
      if (argv.exclude) {
        if (typeof(argv.exclude) === "string") {
          argv.exclude = [new RegExp(argv.exclude)];
@@ -312,11 +398,23 @@
        argv.exclude = [];
      }
 
+      // Support one or multiple scopes
+      if (argv.scope) {
+        if (typeof(argv.scope) === "string") {
+          argv.scope = [new RegExp(argv.scope)];
+        } else {
+          argv.scope = argv.scope.map(e => new RegExp(e));
+        }
+      } else {
+        argv.scope = [];
+      }
+
      return true;
    })
    .argv;
 
  console.log("Exclusions Regexes: ", params.exclude);
+  console.log("Scope Regexes: ", params.scope);
 
  try {
    await run(params);
@@ -328,6 +426,11 @@
  }
 }
 
+function rxEscape(string) {
+  return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
+}
+
+
 main();
 
 
diff --git a/package.json b/package.json
index e991921..3820178 100644
--- a/package.json
+++ b/package.json
@@ -8,8 +8,8 @@
   "dependencies": {
     "abort-controller": "^3.0.0",
     "node-fetch": "^2.6.1",
-    "puppeteer": "^5.3.0",
     "puppeteer-cluster": "^0.22.0",
+    "puppeteer-core": "^5.3.1",
     "yargs": "^16.0.3"
   }
 }
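Replacing puppeteer with puppeteer-core means no Chromium is downloaded at install time, so the Chrome binary copied from the chrome stage has to be pointed to explicitly. A minimal launch sketch under that assumption, using the same path as the Dockerfile and puppeteerOptions above:

```js
const puppeteer = require("puppeteer-core");

// Sketch only: puppeteer-core ships no bundled browser, so launch() needs an
// explicit executablePath; here it is the Chrome copied in by the Dockerfile.
async function launchBrowser() {
  return puppeteer.launch({
    headless: true,
    executablePath: "/opt/google/chrome/google-chrome",
    args: ["--no-sandbox", "--autoplay-policy=no-user-gesture-required"],
  });
}

launchBrowser()
  .then((browser) => browser.close())
  .catch(console.error);
```

This is also why the Dockerfile now copies /opt/google/chrome/ from the chrome build stage.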
diff --git a/yarn.lock b/yarn.lock
index 458e423..91ceab7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -8,9 +8,9 @@
   integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==
 
 "@types/node@*":
-  version "14.11.1"
-  resolved "https://registry.yarnpkg.com/@types/node/-/node-14.11.1.tgz#56af902ad157e763f9ba63d671c39cda3193c835"
-  integrity sha512-oTQgnd0hblfLsJ6BvJzzSL+Inogp3lq9fGgqRkMB/ziKMgEUaFl801OncOzUmalfzt14N0oPHMK47ipl+wbTIw==
+  version "14.14.0"
+  resolved "https://registry.yarnpkg.com/@types/node/-/node-14.14.0.tgz#f1091b6ad5de18e8e91bdbd43ec63f13de372538"
+  integrity sha512-BfbIHP9IapdupGhq/hc+jT5dyiBVZ2DdeC5WwJWQWDb0GijQlzUFAeIQn/2GtvZcd2HVUU7An8felIICFTC2qg==
 
 "@types/yauzl@^2.9.1":
   version "2.9.1"
@@ -247,11 +247,6 @@ locate-path@^5.0.0:
   dependencies:
     p-locate "^4.1.0"
 
-mime@^2.0.3:
-  version "2.4.6"
-  resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.6.tgz#e5b407c90db442f2beb5b162373d07b69affa4d1"
-  integrity sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA==
-
 minimatch@^3.0.4:
   version "3.0.4"
   resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
@@ -347,16 +342,15 @@ puppeteer-cluster@^0.22.0:
   dependencies:
     debug "^4.1.1"
 
-puppeteer@^5.3.0:
-  version "5.3.0"
-  resolved "https://registry.yarnpkg.com/puppeteer/-/puppeteer-5.3.0.tgz#0abf83d0f2d1273baf2b56885a813f8052903e33"
-  integrity sha512-GjqMk5GRro3TO0sw3QMsF1H7n+/jaK2OW45qMvqjYUyJ7y4oA//9auy969HHhTG3HZXaMxY/NWXF/NXlAFIvtw==
+puppeteer-core@^5.3.1:
+  version "5.3.1"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.1.tgz#1affb1738afac499416a7fd4ed2ed0c18577e88f"
+  integrity sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==
   dependencies:
     debug "^4.1.0"
     devtools-protocol "0.0.799653"
     extract-zip "^2.0.0"
     https-proxy-agent "^4.0.0"
-    mime "^2.0.3"
     pkg-dir "^4.2.0"
     progress "^2.0.1"
     proxy-from-env "^1.0.0"
diff --git a/zimit.py b/zimit.py
index 30cec80..9272d5d 100755
--- a/zimit.py
+++ b/zimit.py
@@ -31,6 +31,13 @@ def zimit(args=None):
 
     parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
 
+    parser.add_argument(
+        "--newContext",
+        help="The context for each new capture, can be a new: page, session or browser.",
+        choices=["page", "session", "browser"],
+        default="page",
+    )
+
     parser.add_argument(
         "--waitUntil",
         help="Puppeteer page.goto() condition to wait for before continuing",
@@ -51,11 +58,11 @@ def zimit(args=None):
 
     parser.add_argument(
         "--scope",
-        help="The scope of current page that should be included in the crawl (defaults to the immediate directory of the URL)",
+        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
     )
 
     parser.add_argument(
-        "--exclude", help="Regex of URLs that should be excluded from the crawl."
+        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
     )
 
     parser.add_argument(
@@ -150,6 +157,7 @@ def get_node_cmd_line(args):
     for arg in [
         "url",
         "workers",
+        "newContext",
         "waitUntil",
         "limit",
         "timeout",
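zimit.py only validates --newContext and forwards it to the Node crawler; crawler.js then maps the value onto puppeteer-cluster's isolation levels. A sketch of that mapping, using the same constants as the switch statement in run():

```js
const { Cluster } = require("puppeteer-cluster");

// Sketch of the --newContext mapping used in run() above:
//   page    -> new tab per task, shared browser and session
//   session -> new incognito browser context per task
//   browser -> separate browser instance per task
const CONCURRENCY_FOR_CONTEXT = {
  page: Cluster.CONCURRENCY_PAGE,
  session: Cluster.CONCURRENCY_CONTEXT,
  browser: Cluster.CONCURRENCY_BROWSER,
};

function concurrencyFor(newContext) {
  if (!(newContext in CONCURRENCY_FOR_CONTEXT)) {
    throw new Error("Invalid newContext, must be one of: page,session,browser");
  }
  return CONCURRENCY_FOR_CONTEXT[newContext];
}

console.log(concurrencyFor("session") === Cluster.CONCURRENCY_CONTEXT); // true
```

The stronger the isolation, the higher the memory cost: "browser" launches a full Chrome per worker, while "page" keeps all workers in one shared browser and session.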