work on automated capture of video (#9)

- add autoplay behavior: reload known video sites with autoplay enabled
- for video/audio elements on the page, queue the media URL directly for capture when video.src or audio.src is set to a valid http(s) URL; otherwise load the media by playing it in the browser, which may be slower (see the sketch after this list)
- add an extra wait when a page is reloaded for autoplay
- timeouts: set the puppeteer-cluster timeout to double the per-page timeout so the cluster timeout is not hit during regular operation
- use the browser from oldwebtoday/chrome:84 with puppeteer-core instead of the bundled puppeteer browser, for consistent results
- temp testing: use a custom wabac.js service worker for testing (the default from warc2zim will be used later); use the warc2zim fuzzy-match branch for now
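For orientation, here is a condensed, illustrative sketch of the per-page wiring this commit adds to crawler.js. setupMediaCapture is a hypothetical helper name used only for this sketch; the real code inlines the same steps in the cluster task and performs direct capture through a directCapture() helper with an AbortController.

// Illustrative sketch only: condenses the per-page media-capture wiring added
// to crawler.js in this commit. Assumes a puppeteer-core `page`, the contents
// of autoplay.js passed in as `autoplayScript`, and the PROXY_HOST/PROXY_PORT
// variables used by the real crawler.
const fetch = require("node-fetch");

const capturePrefix =
  `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function setupMediaCapture(page, autoplayScript) {
  const mediaResults = [];
  let waitForVideo = false;

  // Called by the in-page script for <video>/<audio> elements with a direct
  // http(s) src: fetching the URL through pywb's record endpoint captures the
  // media without having to play it.
  await page.exposeFunction("__crawler_queueUrls", (url) => {
    mediaResults.push(fetch(capturePrefix + url));
  });

  // Called by the in-page script when it reloads a known embed (SoundCloud,
  // Vimeo, YouTube) with autoplay enabled, so the crawler waits longer.
  await page.exposeFunction("__crawler_autoplayLoad", (url) => {
    console.log("Loading autoplay URL: " + url);
    waitForVideo = true;
  });

  // Inject autoplay.js before any page script runs, on every navigation.
  await page.evaluateOnNewDocument(autoplayScript);

  // After page.goto(...), the crawler awaits the queued direct captures and,
  // if an autoplay reload happened, sleeps an extra 15s for the video to load.
  return async () => {
    try {
      await Promise.all(mediaResults);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }
    if (waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await sleep(15000);
    }
  };
}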
Ilya Kreymer 2020-10-21 06:09:10 +00:00
parent c6f27f3bf6
commit 91fe76c56e
7 changed files with 203 additions and 31 deletions

Dockerfile

@@ -15,12 +15,13 @@ ENV PROXY_HOST=localhost \
RUN pip install gevent>=20.9.0 uwsgi
RUN pip install warc2zim==1.2.0
RUN pip install git+https://github.com/openzim/warc2zim@fuzzy-match
RUN pip install git+https://github.com/webrecorder/pywb@patch-work
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
WORKDIR /app
@@ -32,6 +33,9 @@ ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD zimit.py /app/
ADD crawler.js /app/
ADD autoplay.js /app/
ADD sw.js /app/
RUN ln -s /app/zimit.py /usr/bin/zimit
CMD ["zimit"]

autoplay.js (new file, 89 lines)

@@ -0,0 +1,89 @@
(() => {
function run() {
if (window.navigator.__crawler_autoplay) {
return;
}
//console.log("checking autoplay for " + document.location.href);
window.navigator.__crawler_autoplay = true;
const specialActions = [
{
rx: /w\.soundcloud\.com/,
check(url) {
const autoplay = url.searchParams.get('auto_play');
return autoplay === 'true';
},
handle(url) {
url.searchParams.set('auto_play', 'true');
// set continuous_play to true in order to handle
// a playlist etc
url.searchParams.set('continuous_play', 'true');
self.location.href = url.href;
},
},
{
rx: [/player\.vimeo\.com/, /youtube\.com\/embed\//],
check(url) {
const autoplay = url.searchParams.get('autoplay');
return autoplay === '1';
},
handle(url) {
url.searchParams.set('autoplay', '1');
if (window.__crawler_autoplayLoad) {
window.__crawler_autoplayLoad(url.href);
}
self.location.href = url.href;
},
},
];
const url = new URL(self.location.href);
for (let i = 0; i < specialActions.length; i++) {
if (Array.isArray(specialActions[i].rx)) {
const rxs = specialActions[i].rx;
for (let j = 0; j < rxs.length; j++) {
if (url.href.search(rxs[j]) >= 0) {
if (specialActions[i].check(url)) return;
return specialActions[i].handle(url);
}
}
} else if (url.href.search(specialActions[i].rx) >= 0) {
if (specialActions[i].check(url)) return;
return specialActions[i].handle(url);
}
}
}
document.addEventListener("readystatechange", run);
if (document.readyState === "complete") {
run();
}
const mediaSet = new Set();
setInterval(() => {
const medias = document.querySelectorAll("video, audio");
for (const media of medias) {
try {
if (media.src && !mediaSet.has(media.src)) {
if (window.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
window.__crawler_queueUrls(media.src);
}
mediaSet.add(media.src);
} else if (!media.src) {
media.play();
}
} catch(e) {
console.log(e);
}
}
}, 3000);
})();

crawler.js

@@ -1,4 +1,5 @@
const puppeteer = require("puppeteer");
const fs = require("fs");
const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
const fetch = require("node-fetch");
@@ -24,30 +25,42 @@ process.once('SIGTERM', (code) => {
});
const autoplayScript = fs.readFileSync("./autoplay.js", "utf-8");
// prefix for direct capture via pywb
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
const headers = {"User-Agent": CHROME_USER_AGENT};
async function run(params) {
// Chrome Flags, including proxy server
const args = [
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
"--no-sandbox"
"--no-sandbox",
"--disable-background-media-suspend",
"--autoplay-policy=no-user-gesture-required",
];
// prefix for direct capture via pywb
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
// Puppeteer Options
const puppeteerOptions = {
headless: true,
//executablePath: "/usr/bin/google-chrome",
executablePath: "/opt/google/chrome/google-chrome",
ignoreHTTPSErrors: true,
args
};
// params
const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;
// Puppeteer Cluster init and options
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: Number(params.workers) || 1,
skipDuplicateUrls: true,
// total timeout for cluster
timeout: timeout * 2,
puppeteerOptions,
puppeteer,
monitor: true
@@ -56,9 +69,6 @@ async function run(params) {
// Maintain own seen list
const seenList = new Set();
// params
const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;
//console.log("Limit: " + limit);
// links crawled counter
@@ -72,12 +82,46 @@ async function run(params) {
return;
}
//page.on('console', message => console.log(`${message.type()} ${message.text()}`));
//page.on('pageerror', message => console.warn(message));
//page.on('error', message => console.warn(message));
//page.on('requestfailed', message => console.warn(message._failureText));
const mediaResults = [];
await page.exposeFunction('__crawler_queueUrls', (url) => {
mediaResults.push(directCapture(url));
});
let waitForVideo = false;
await page.exposeFunction('__crawler_autoplayLoad', (url) => {
console.log("*** Loading autoplay URL: " + url);
waitForVideo = true;
});
try {
await page.evaluateOnNewDocument(autoplayScript);
} catch(e) {
console.log(e);
}
try {
await page.goto(url, {waitUntil, timeout});
} catch (e) {
console.log(`Load timeout for ${url}`);
}
try {
await Promise.all(mediaResults);
} catch (e) {
console.log(`Error loading media URLs`, e);
}
if (waitForVideo) {
console.log("Extra wait 15s for video loading");
await sleep(15000);
}
if (scroll) {
try {
await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
@@ -166,8 +210,6 @@ function shouldCrawl(scope, seenList, url, exclude) {
async function htmlCheck(url, capturePrefix) {
try {
const headers = {"User-Agent": CHROME_USER_AGENT};
const agent = url.startsWith("https:") ? HTTPS_AGENT : null;
const resp = await fetch(url, {method: "HEAD", headers, agent});
@@ -191,11 +233,7 @@ async function htmlCheck(url, capturePrefix) {
}
// capture directly
console.log(`Direct capture: ${capturePrefix}${url}`);
const abort = new AbortController();
const signal = abort.signal;
const resp2 = await fetch(capturePrefix + url, {signal, headers});
abort.abort();
await directCapture(url);
return false;
} catch(e) {
@@ -205,6 +243,15 @@ async function htmlCheck(url, capturePrefix) {
}
}
async function directCapture(url) {
console.log(`Direct capture: ${capturePrefix}${url}`);
const abort = new AbortController();
const signal = abort.signal;
const resp2 = await fetch(capturePrefix + url, {signal, headers});
abort.abort();
}
async function autoScroll() {
const canScrollMore = () =>
@@ -317,6 +364,7 @@ async function main() {
.argv;
console.log("Exclusions Regexes: ", params.exclude);
console.log("Scope: ", params.scope);
try {
await run(params);

package.json

@@ -8,8 +8,8 @@
"dependencies": {
"abort-controller": "^3.0.0",
"node-fetch": "^2.6.1",
"puppeteer": "^5.3.0",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",
"yargs": "^16.0.3"
}
}

sw.js (new file, 33 lines)

File diff suppressed because one or more lines are too long

yarn.lock

@@ -8,9 +8,9 @@
integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==
"@types/node@*":
version "14.11.1"
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.11.1.tgz#56af902ad157e763f9ba63d671c39cda3193c835"
integrity sha512-oTQgnd0hblfLsJ6BvJzzSL+Inogp3lq9fGgqRkMB/ziKMgEUaFl801OncOzUmalfzt14N0oPHMK47ipl+wbTIw==
version "14.14.0"
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.14.0.tgz#f1091b6ad5de18e8e91bdbd43ec63f13de372538"
integrity sha512-BfbIHP9IapdupGhq/hc+jT5dyiBVZ2DdeC5WwJWQWDb0GijQlzUFAeIQn/2GtvZcd2HVUU7An8felIICFTC2qg==
"@types/yauzl@^2.9.1":
version "2.9.1"
@@ -247,11 +247,6 @@ locate-path@^5.0.0:
dependencies:
p-locate "^4.1.0"
mime@^2.0.3:
version "2.4.6"
resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.6.tgz#e5b407c90db442f2beb5b162373d07b69affa4d1"
integrity sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA==
minimatch@^3.0.4:
version "3.0.4"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
@@ -347,16 +342,15 @@ puppeteer-cluster@^0.22.0:
dependencies:
debug "^4.1.1"
puppeteer@^5.3.0:
version "5.3.0"
resolved "https://registry.yarnpkg.com/puppeteer/-/puppeteer-5.3.0.tgz#0abf83d0f2d1273baf2b56885a813f8052903e33"
integrity sha512-GjqMk5GRro3TO0sw3QMsF1H7n+/jaK2OW45qMvqjYUyJ7y4oA//9auy969HHhTG3HZXaMxY/NWXF/NXlAFIvtw==
puppeteer-core@^5.3.1:
version "5.3.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.1.tgz#1affb1738afac499416a7fd4ed2ed0c18577e88f"
integrity sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==
dependencies:
debug "^4.1.0"
devtools-protocol "0.0.799653"
extract-zip "^2.0.0"
https-proxy-agent "^4.0.0"
mime "^2.0.3"
pkg-dir "^4.2.0"
progress "^2.0.1"
proxy-from-env "^1.0.0"

zimit.py

@@ -86,6 +86,10 @@ def zimit(args=None):
warc2zim_args.append("--url")
warc2zim_args.append(zimit_args.url)
subprocess.Popen(["/usr/bin/env", "python", "-m", "http.server", "9990"])
warc2zim_args.append("-r")
warc2zim_args.append("http://localhost:9990/")
print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args))