mirror of
https://github.com/openzim/zimit.git
synced 2025-09-28 15:33:57 -04:00
Merge pull request #36 from openzim/video-capture-work
work on automated capture of video (#9)
This commit is contained in:
commit
f6282dbf14
@ -15,12 +15,14 @@ ENV PROXY_HOST=localhost \
|
|||||||
|
|
||||||
RUN pip install gevent>=20.9.0 uwsgi
|
RUN pip install gevent>=20.9.0 uwsgi
|
||||||
|
|
||||||
RUN pip install warc2zim==1.2.0
|
#RUN pip install git+https://github.com/openzim/warc2zim@fuzzy-match
|
||||||
|
RUN pip install 'warc2zim>=1.3.0'
|
||||||
|
|
||||||
RUN pip install git+https://github.com/webrecorder/pywb@patch-work
|
RUN pip install git+https://github.com/webrecorder/pywb@patch-work
|
||||||
|
|
||||||
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
|
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
|
||||||
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
|
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
|
||||||
|
COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
@ -32,6 +34,8 @@ ADD config.yaml /app/
|
|||||||
ADD uwsgi.ini /app/
|
ADD uwsgi.ini /app/
|
||||||
ADD zimit.py /app/
|
ADD zimit.py /app/
|
||||||
ADD crawler.js /app/
|
ADD crawler.js /app/
|
||||||
|
ADD autoplay.js /app/
|
||||||
|
|
||||||
RUN ln -s /app/zimit.py /usr/bin/zimit
|
RUN ln -s /app/zimit.py /usr/bin/zimit
|
||||||
|
|
||||||
CMD ["zimit"]
|
CMD ["zimit"]
|
||||||
|
89
autoplay.js
Normal file
89
autoplay.js
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
(() => {
|
||||||
|
|
||||||
|
function run() {
|
||||||
|
if (window.navigator.__crawler_autoplay) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
//console.log("checking autoplay for " + document.location.href);
|
||||||
|
window.navigator.__crawler_autoplay = true;
|
||||||
|
|
||||||
|
const specialActions = [
|
||||||
|
{
|
||||||
|
rx: /w\.soundcloud\.com/,
|
||||||
|
check(url) {
|
||||||
|
const autoplay = url.searchParams.get('auto_play');
|
||||||
|
return autoplay === 'true';
|
||||||
|
},
|
||||||
|
handle(url) {
|
||||||
|
url.searchParams.set('auto_play', 'true');
|
||||||
|
// set continuous_play to true in order to handle
|
||||||
|
// a playlist etc
|
||||||
|
url.searchParams.set('continuous_play', 'true');
|
||||||
|
self.location.href = url.href;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
rx: [/player\.vimeo\.com/, /youtube\.com\/embed\//],
|
||||||
|
check(url) {
|
||||||
|
const autoplay = url.searchParams.get('autoplay');
|
||||||
|
return autoplay === '1';
|
||||||
|
},
|
||||||
|
handle(url) {
|
||||||
|
url.searchParams.set('autoplay', '1');
|
||||||
|
if (window.__crawler_autoplayLoad) {
|
||||||
|
window.__crawler_autoplayLoad(url.href);
|
||||||
|
}
|
||||||
|
self.location.href = url.href;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
];
|
||||||
|
const url = new URL(self.location.href);
|
||||||
|
for (let i = 0; i < specialActions.length; i++) {
|
||||||
|
if (Array.isArray(specialActions[i].rx)) {
|
||||||
|
const rxs = specialActions[i].rx;
|
||||||
|
for (let j = 0; j < rxs.length; j++) {
|
||||||
|
if (url.href.search(rxs[j]) >= 0) {
|
||||||
|
if (specialActions[i].check(url)) return;
|
||||||
|
return specialActions[i].handle(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (url.href.search(specialActions[i].rx) >= 0) {
|
||||||
|
if (specialActions[i].check(url)) return;
|
||||||
|
return specialActions[i].handle(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addEventListener("readystatechange", run);
|
||||||
|
|
||||||
|
if (document.readyState === "complete") {
|
||||||
|
run();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const mediaSet = new Set();
|
||||||
|
|
||||||
|
setInterval(() => {
|
||||||
|
const medias = document.querySelectorAll("video, audio");
|
||||||
|
|
||||||
|
for (const media of medias) {
|
||||||
|
try {
|
||||||
|
if (media.src && !mediaSet.has(media.src)) {
|
||||||
|
if (window.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
|
||||||
|
window.__crawler_queueUrls(media.src);
|
||||||
|
}
|
||||||
|
mediaSet.add(media.src);
|
||||||
|
} else if (!media.src) {
|
||||||
|
media.play();
|
||||||
|
}
|
||||||
|
} catch(e) {
|
||||||
|
console.log(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 3000);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
})();
|
||||||
|
|
147
crawler.js
147
crawler.js
@ -1,4 +1,5 @@
|
|||||||
const puppeteer = require("puppeteer");
|
const fs = require("fs");
|
||||||
|
const puppeteer = require("puppeteer-core");
|
||||||
const { Cluster } = require("puppeteer-cluster");
|
const { Cluster } = require("puppeteer-cluster");
|
||||||
const child_process = require("child_process");
|
const child_process = require("child_process");
|
||||||
const fetch = require("node-fetch");
|
const fetch = require("node-fetch");
|
||||||
@ -6,6 +7,7 @@ const AbortController = require("abort-controller");
|
|||||||
|
|
||||||
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
||||||
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
||||||
|
const NEW_CONTEXT_OPTS = ["page", "session", "browser"];
|
||||||
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
|
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
|
||||||
|
|
||||||
// to ignore HTTPS error for HEAD check
|
// to ignore HTTPS error for HEAD check
|
||||||
@ -24,30 +26,58 @@ process.once('SIGTERM', (code) => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
const autoplayScript = fs.readFileSync("./autoplay.js", "utf-8");
|
||||||
|
|
||||||
|
|
||||||
|
// prefix for direct capture via pywb
|
||||||
|
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
|
||||||
|
const headers = {"User-Agent": CHROME_USER_AGENT};
|
||||||
|
|
||||||
|
|
||||||
async function run(params) {
|
async function run(params) {
|
||||||
// Chrome Flags, including proxy server
|
// Chrome Flags, including proxy server
|
||||||
const args = [
|
const args = [
|
||||||
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
||||||
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
||||||
"--no-sandbox"
|
"--no-sandbox",
|
||||||
|
"--disable-background-media-suspend",
|
||||||
|
"--autoplay-policy=no-user-gesture-required",
|
||||||
];
|
];
|
||||||
|
|
||||||
// prefix for direct capture via pywb
|
|
||||||
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
|
|
||||||
|
|
||||||
// Puppeter Options
|
// Puppeter Options
|
||||||
const puppeteerOptions = {
|
const puppeteerOptions = {
|
||||||
headless: true,
|
headless: true,
|
||||||
//executablePath: "/usr/bin/google-chrome",
|
executablePath: "/opt/google/chrome/google-chrome",
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args
|
args
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// params
|
||||||
|
const { url, waitUntil, timeout, scope, limit, exclude, scroll, newContext } = params;
|
||||||
|
|
||||||
|
let concurrency = Cluster.CONCURRENCY_PAGE;
|
||||||
|
|
||||||
|
switch (newContext) {
|
||||||
|
case "page":
|
||||||
|
concurrency = Cluster.CONCURRENCY_PAGE;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "session":
|
||||||
|
concurrency = Cluster.CONCURRENCY_CONTEXT;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "browser":
|
||||||
|
concurrency = Cluster.CONCURRENCY_BROWSER;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// Puppeteer Cluster init and options
|
// Puppeteer Cluster init and options
|
||||||
const cluster = await Cluster.launch({
|
const cluster = await Cluster.launch({
|
||||||
concurrency: Cluster.CONCURRENCY_PAGE,
|
concurrency,
|
||||||
maxConcurrency: Number(params.workers) || 1,
|
maxConcurrency: Number(params.workers) || 1,
|
||||||
skipDuplicateUrls: true,
|
skipDuplicateUrls: true,
|
||||||
|
// total timeout for cluster
|
||||||
|
timeout: timeout * 2,
|
||||||
puppeteerOptions,
|
puppeteerOptions,
|
||||||
puppeteer,
|
puppeteer,
|
||||||
monitor: true
|
monitor: true
|
||||||
@ -56,9 +86,6 @@ async function run(params) {
|
|||||||
// Maintain own seen list
|
// Maintain own seen list
|
||||||
const seenList = new Set();
|
const seenList = new Set();
|
||||||
|
|
||||||
// params
|
|
||||||
const { url, waitUntil, timeout, scope, limit, exclude, scroll } = params;
|
|
||||||
|
|
||||||
//console.log("Limit: " + limit);
|
//console.log("Limit: " + limit);
|
||||||
|
|
||||||
// links crawled counter
|
// links crawled counter
|
||||||
@ -72,12 +99,46 @@ async function run(params) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//page.on('console', message => console.log(`${message.type()} ${message.text()}`));
|
||||||
|
//page.on('pageerror', message => console.warn(message));
|
||||||
|
//page.on('error', message => console.warn(message));
|
||||||
|
//page.on('requestfailed', message => console.warn(message._failureText));
|
||||||
|
const mediaResults = [];
|
||||||
|
|
||||||
|
await page.exposeFunction('__crawler_queueUrls', (url) => {
|
||||||
|
mediaResults.push(directCapture(url));
|
||||||
|
});
|
||||||
|
|
||||||
|
let waitForVideo = false;
|
||||||
|
|
||||||
|
await page.exposeFunction('__crawler_autoplayLoad', (url) => {
|
||||||
|
console.log("*** Loading autoplay URL: " + url);
|
||||||
|
waitForVideo = true;
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
await page.evaluateOnNewDocument(autoplayScript);
|
||||||
|
} catch(e) {
|
||||||
|
console.log(e);
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await page.goto(url, {waitUntil, timeout});
|
await page.goto(url, {waitUntil, timeout});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.log(`Load timeout for ${url}`);
|
console.log(`Load timeout for ${url}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
await Promise.all(mediaResults);
|
||||||
|
} catch (e) {
|
||||||
|
console.log(`Error loading media URLs`, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (waitForVideo) {
|
||||||
|
console.log("Extra wait 15s for video loading");
|
||||||
|
await sleep(15000);
|
||||||
|
}
|
||||||
|
|
||||||
if (scroll) {
|
if (scroll) {
|
||||||
try {
|
try {
|
||||||
await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
|
await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
|
||||||
@ -148,8 +209,18 @@ function shouldCrawl(scope, seenList, url, exclude) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if scope is provided, skip urls not in scope
|
let inScope = false;
|
||||||
if (scope && !url.startsWith(scope)) {
|
|
||||||
|
// check scopes
|
||||||
|
for (const s of scope) {
|
||||||
|
if (s.exec(url)) {
|
||||||
|
inScope = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!inScope) {
|
||||||
|
//console.log(`Not in scope ${url} ${scope}`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,8 +237,6 @@ function shouldCrawl(scope, seenList, url, exclude) {
|
|||||||
|
|
||||||
async function htmlCheck(url, capturePrefix) {
|
async function htmlCheck(url, capturePrefix) {
|
||||||
try {
|
try {
|
||||||
const headers = {"User-Agent": CHROME_USER_AGENT};
|
|
||||||
|
|
||||||
const agent = url.startsWith("https:") ? HTTPS_AGENT : null;
|
const agent = url.startsWith("https:") ? HTTPS_AGENT : null;
|
||||||
|
|
||||||
const resp = await fetch(url, {method: "HEAD", headers, agent});
|
const resp = await fetch(url, {method: "HEAD", headers, agent});
|
||||||
@ -191,11 +260,7 @@ async function htmlCheck(url, capturePrefix) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// capture directly
|
// capture directly
|
||||||
console.log(`Direct capture: ${capturePrefix}${url}`);
|
await directCapture(url);
|
||||||
const abort = new AbortController();
|
|
||||||
const signal = abort.signal;
|
|
||||||
const resp2 = await fetch(capturePrefix + url, {signal, headers});
|
|
||||||
abort.abort();
|
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
@ -205,6 +270,15 @@ async function htmlCheck(url, capturePrefix) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function directCapture(url) {
|
||||||
|
console.log(`Direct capture: ${capturePrefix}${url}`);
|
||||||
|
const abort = new AbortController();
|
||||||
|
const signal = abort.signal;
|
||||||
|
const resp2 = await fetch(capturePrefix + url, {signal, headers});
|
||||||
|
abort.abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async function autoScroll() {
|
async function autoScroll() {
|
||||||
const canScrollMore = () =>
|
const canScrollMore = () =>
|
||||||
@ -249,6 +323,12 @@ async function main() {
|
|||||||
type: "number",
|
type: "number",
|
||||||
},
|
},
|
||||||
|
|
||||||
|
"newContext": {
|
||||||
|
describe: "The context for each new capture, can be a new: page, session or browser.",
|
||||||
|
default: "page",
|
||||||
|
type: "string"
|
||||||
|
},
|
||||||
|
|
||||||
"waitUntil": {
|
"waitUntil": {
|
||||||
describe: "Puppeteer page.goto() condition to wait for before continuing",
|
describe: "Puppeteer page.goto() condition to wait for before continuing",
|
||||||
default: "load",
|
default: "load",
|
||||||
@ -267,11 +347,11 @@ async function main() {
|
|||||||
},
|
},
|
||||||
|
|
||||||
"scope": {
|
"scope": {
|
||||||
describe: "The scope of current page that should be included in the crawl (defaults to the immediate directory of URL)",
|
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
|
||||||
},
|
},
|
||||||
|
|
||||||
"exclude": {
|
"exclude": {
|
||||||
describe: "Regex of URLs that should be excluded from the crawl."
|
describe: "Regex of page URLs that should be excluded from the crawl."
|
||||||
},
|
},
|
||||||
|
|
||||||
"scroll": {
|
"scroll": {
|
||||||
@ -291,7 +371,8 @@ async function main() {
|
|||||||
argv.url = url.href;
|
argv.url = url.href;
|
||||||
|
|
||||||
if (!argv.scope) {
|
if (!argv.scope) {
|
||||||
argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
|
//argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
|
||||||
|
argv.scope = [new RegExp("^" + rxEscape(url.href.slice(0, url.href.lastIndexOf("/") + 1)))];
|
||||||
}
|
}
|
||||||
|
|
||||||
argv.timeout *= 1000;
|
argv.timeout *= 1000;
|
||||||
@ -302,6 +383,11 @@ async function main() {
|
|||||||
throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
|
throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!NEW_CONTEXT_OPTS.includes(argv.newContext)) {
|
||||||
|
throw new Error("Invalid newContext, must be one of: " + NEW_CONTEXT_OPTS.join(","));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Support one or multiple exclude
|
||||||
if (argv.exclude) {
|
if (argv.exclude) {
|
||||||
if (typeof(argv.exclude) === "string") {
|
if (typeof(argv.exclude) === "string") {
|
||||||
argv.exclude = [new RegExp(argv.exclude)];
|
argv.exclude = [new RegExp(argv.exclude)];
|
||||||
@ -312,11 +398,23 @@ async function main() {
|
|||||||
argv.exclude = [];
|
argv.exclude = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Support one or multiple scopes
|
||||||
|
if (argv.scope) {
|
||||||
|
if (typeof(argv.scope) === "string") {
|
||||||
|
argv.scope = [new RegExp(argv.scope)];
|
||||||
|
} else {
|
||||||
|
argv.scope = argv.scope.map(e => new RegExp(e));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
argv.scope = [];
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
})
|
})
|
||||||
.argv;
|
.argv;
|
||||||
|
|
||||||
console.log("Exclusions Regexes: ", params.exclude);
|
console.log("Exclusions Regexes: ", params.exclude);
|
||||||
|
console.log("Scope Regexes: ", params.scope);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await run(params);
|
await run(params);
|
||||||
@ -328,6 +426,11 @@ async function main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function rxEscape(string) {
|
||||||
|
return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
main();
|
main();
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,8 +8,8 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"abort-controller": "^3.0.0",
|
"abort-controller": "^3.0.0",
|
||||||
"node-fetch": "^2.6.1",
|
"node-fetch": "^2.6.1",
|
||||||
"puppeteer": "^5.3.0",
|
|
||||||
"puppeteer-cluster": "^0.22.0",
|
"puppeteer-cluster": "^0.22.0",
|
||||||
|
"puppeteer-core": "^5.3.1",
|
||||||
"yargs": "^16.0.3"
|
"yargs": "^16.0.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
20
yarn.lock
20
yarn.lock
@ -8,9 +8,9 @@
|
|||||||
integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==
|
integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==
|
||||||
|
|
||||||
"@types/node@*":
|
"@types/node@*":
|
||||||
version "14.11.1"
|
version "14.14.0"
|
||||||
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.11.1.tgz#56af902ad157e763f9ba63d671c39cda3193c835"
|
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.14.0.tgz#f1091b6ad5de18e8e91bdbd43ec63f13de372538"
|
||||||
integrity sha512-oTQgnd0hblfLsJ6BvJzzSL+Inogp3lq9fGgqRkMB/ziKMgEUaFl801OncOzUmalfzt14N0oPHMK47ipl+wbTIw==
|
integrity sha512-BfbIHP9IapdupGhq/hc+jT5dyiBVZ2DdeC5WwJWQWDb0GijQlzUFAeIQn/2GtvZcd2HVUU7An8felIICFTC2qg==
|
||||||
|
|
||||||
"@types/yauzl@^2.9.1":
|
"@types/yauzl@^2.9.1":
|
||||||
version "2.9.1"
|
version "2.9.1"
|
||||||
@ -247,11 +247,6 @@ locate-path@^5.0.0:
|
|||||||
dependencies:
|
dependencies:
|
||||||
p-locate "^4.1.0"
|
p-locate "^4.1.0"
|
||||||
|
|
||||||
mime@^2.0.3:
|
|
||||||
version "2.4.6"
|
|
||||||
resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.6.tgz#e5b407c90db442f2beb5b162373d07b69affa4d1"
|
|
||||||
integrity sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA==
|
|
||||||
|
|
||||||
minimatch@^3.0.4:
|
minimatch@^3.0.4:
|
||||||
version "3.0.4"
|
version "3.0.4"
|
||||||
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
|
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
|
||||||
@ -347,16 +342,15 @@ puppeteer-cluster@^0.22.0:
|
|||||||
dependencies:
|
dependencies:
|
||||||
debug "^4.1.1"
|
debug "^4.1.1"
|
||||||
|
|
||||||
puppeteer@^5.3.0:
|
puppeteer-core@^5.3.1:
|
||||||
version "5.3.0"
|
version "5.3.1"
|
||||||
resolved "https://registry.yarnpkg.com/puppeteer/-/puppeteer-5.3.0.tgz#0abf83d0f2d1273baf2b56885a813f8052903e33"
|
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.1.tgz#1affb1738afac499416a7fd4ed2ed0c18577e88f"
|
||||||
integrity sha512-GjqMk5GRro3TO0sw3QMsF1H7n+/jaK2OW45qMvqjYUyJ7y4oA//9auy969HHhTG3HZXaMxY/NWXF/NXlAFIvtw==
|
integrity sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==
|
||||||
dependencies:
|
dependencies:
|
||||||
debug "^4.1.0"
|
debug "^4.1.0"
|
||||||
devtools-protocol "0.0.799653"
|
devtools-protocol "0.0.799653"
|
||||||
extract-zip "^2.0.0"
|
extract-zip "^2.0.0"
|
||||||
https-proxy-agent "^4.0.0"
|
https-proxy-agent "^4.0.0"
|
||||||
mime "^2.0.3"
|
|
||||||
pkg-dir "^4.2.0"
|
pkg-dir "^4.2.0"
|
||||||
progress "^2.0.1"
|
progress "^2.0.1"
|
||||||
proxy-from-env "^1.0.0"
|
proxy-from-env "^1.0.0"
|
||||||
|
12
zimit.py
12
zimit.py
@ -31,6 +31,13 @@ def zimit(args=None):
|
|||||||
|
|
||||||
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
|
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--newContext",
|
||||||
|
help="The context for each new capture, can be a new: page, session or browser.",
|
||||||
|
choices=["page", "session", "browser"],
|
||||||
|
default="page",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--waitUntil",
|
"--waitUntil",
|
||||||
help="Puppeteer page.goto() condition to wait for before continuing",
|
help="Puppeteer page.goto() condition to wait for before continuing",
|
||||||
@ -51,11 +58,11 @@ def zimit(args=None):
|
|||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--scope",
|
"--scope",
|
||||||
help="The scope of current page that should be included in the crawl (defaults to the immediate directory of the URL)",
|
help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--exclude", help="Regex of URLs that should be excluded from the crawl."
|
"--exclude", help="Regex of page URLs that should be excluded from the crawl."
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -150,6 +157,7 @@ def get_node_cmd_line(args):
|
|||||||
for arg in [
|
for arg in [
|
||||||
"url",
|
"url",
|
||||||
"workers",
|
"workers",
|
||||||
|
"newContext",
|
||||||
"waitUntil",
|
"waitUntil",
|
||||||
"limit",
|
"limit",
|
||||||
"timeout",
|
"timeout",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user