mirror of
https://github.com/openzim/zimit.git
synced 2025-09-22 11:22:23 -04:00
split zimit from core browsertrix-crawler, which has been moved to https://github.com/webrecorder/browsertrix-crawler
use versioned browsertrix-crawler:0.1.0 image part of #45
This commit is contained in:
parent
f6282dbf14
commit
c228c8300c
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@ __pycache__
|
|||||||
*.egg-info/
|
*.egg-info/
|
||||||
collections/
|
collections/
|
||||||
node_modules/
|
node_modules/
|
||||||
|
output/
|
||||||
|
34
Dockerfile
34
Dockerfile
@ -1,40 +1,10 @@
|
|||||||
FROM oldwebtoday/chrome:84 as chrome
|
FROM webrecorder/browsertrix-crawler:0.1.0
|
||||||
|
|
||||||
FROM nikolaik/python-nodejs:python3.8-nodejs14
|
|
||||||
|
|
||||||
RUN apt-get update -y \
|
|
||||||
&& apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server \
|
|
||||||
&& apt-get clean \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
ENV PROXY_HOST=localhost \
|
|
||||||
PROXY_PORT=8080 \
|
|
||||||
PROXY_CA_URL=http://wsgiprox/download/pem \
|
|
||||||
PROXY_CA_FILE=/tmp/proxy-ca.pem \
|
|
||||||
NO_SOCAT=1
|
|
||||||
|
|
||||||
RUN pip install gevent>=20.9.0 uwsgi
|
|
||||||
|
|
||||||
#RUN pip install git+https://github.com/openzim/warc2zim@fuzzy-match
|
|
||||||
RUN pip install 'warc2zim>=1.3.0'
|
|
||||||
|
|
||||||
RUN pip install git+https://github.com/webrecorder/pywb@patch-work
|
|
||||||
|
|
||||||
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
|
|
||||||
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
|
|
||||||
COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
|
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
ADD package.json /app/
|
RUN pip install 'warc2zim>=1.3.0'
|
||||||
|
|
||||||
RUN yarn install
|
|
||||||
|
|
||||||
ADD config.yaml /app/
|
|
||||||
ADD uwsgi.ini /app/
|
|
||||||
ADD zimit.py /app/
|
ADD zimit.py /app/
|
||||||
ADD crawler.js /app/
|
|
||||||
ADD autoplay.js /app/
|
|
||||||
|
|
||||||
RUN ln -s /app/zimit.py /usr/bin/zimit
|
RUN ln -s /app/zimit.py /usr/bin/zimit
|
||||||
|
|
||||||
|
12
README.md
12
README.md
@ -12,17 +12,16 @@ Technical background
|
|||||||
|
|
||||||
This version of Zimit runs a single-site headless-Chrome based crawl in a Docker container and produces a ZIM of the crawled content.
|
This version of Zimit runs a single-site headless-Chrome based crawl in a Docker container and produces a ZIM of the crawled content.
|
||||||
|
|
||||||
The system uses:
|
The system extends the crawling system in [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) and converts
|
||||||
- `oldwebtoday/chrome` - to install a recent version of Chrome 84
|
the crawled WARC files to ZIM using [warc2zim](https://github.com/openzim/warc2zim)
|
||||||
- `puppeteer-cluster` - for running Chrome browsers in parallel
|
|
||||||
- `pywb` - in recording mode for capturing the content
|
|
||||||
- `warc2zim` - to convert the crawled WARC files into a ZIM
|
|
||||||
|
|
||||||
The driver in `index.js` crawls a given URL using puppeteer-cluster.
|
`zimit.py` is the entrypoint for the system.
|
||||||
|
|
||||||
After the crawl is done, warc2zim is used to write a zim to the
|
After the crawl is done, warc2zim is used to write a zim to the
|
||||||
`/output` directory, which can be mounted as a volume.
|
`/output` directory, which can be mounted as a volume.
|
||||||
|
|
||||||
|
Using the `--keep` flag, the crawled WARCs will also be kept in a temp directory inside `/output`
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
|
|
||||||
@ -44,6 +43,7 @@ The image accepts the following parameters:
|
|||||||
- `--limit U` - Limit capture to at most U URLs
|
- `--limit U` - Limit capture to at most U URLs
|
||||||
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times.
|
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times.
|
||||||
- `--scroll [N]` - if set, will activate a simple auto-scroll behavior on each page to scroll for up to N seconds
|
- `--scroll [N]` - if set, will activate a simple auto-scroll behavior on each page to scroll for up to N seconds
|
||||||
|
- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
|
||||||
|
|
||||||
The following is an example usage. The `--cap-add` and `--shm-size`
|
The following is an example usage. The `--cap-add` and `--shm-size`
|
||||||
flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips).
|
flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips).
|
||||||
|
89
autoplay.js
89
autoplay.js
@ -1,89 +0,0 @@
|
|||||||
/**
 * Autoplay helper evaluated in each crawled document.
 *
 * - For known embed players (SoundCloud widget, Vimeo player, YouTube embed)
 *   it reloads the frame with the player's autoplay query parameter set.
 * - Every 3 seconds it scans for <video>/<audio> elements: a not-yet-seen
 *   `src` is remembered and, when it is an http(s) URL and the page provides
 *   `window.__crawler_queueUrls`, reported through that hook; elements
 *   without a `src` are asked to start playing.
 */
(() => {
  // Players that only start on their own when a query parameter is set.
  // `check` tells whether the parameter is already present, `handle`
  // navigates the frame to the same URL with the parameter added.
  const specialActions = [
    {
      rx: /w\.soundcloud\.com/,
      check(url) {
        return url.searchParams.get('auto_play') === 'true';
      },
      handle(url) {
        url.searchParams.set('auto_play', 'true');
        // set continuous_play to true in order to handle a playlist etc
        url.searchParams.set('continuous_play', 'true');
        self.location.href = url.href;
      },
    },
    {
      rx: [/player\.vimeo\.com/, /youtube\.com\/embed\//],
      check(url) {
        return url.searchParams.get('autoplay') === '1';
      },
      handle(url) {
        url.searchParams.set('autoplay', '1');
        // tell the host page (if it listens) which URL is about to be loaded
        if (window.__crawler_autoplayLoad) {
          window.__crawler_autoplayLoad(url.href);
        }
        self.location.href = url.href;
      },
    },
  ];

  function run() {
    // the flag makes sure the player check happens only once per document
    if (window.navigator.__crawler_autoplay) {
      return;
    }
    window.navigator.__crawler_autoplay = true;

    const url = new URL(self.location.href);

    for (const action of specialActions) {
      const patterns = Array.isArray(action.rx) ? action.rx : [action.rx];
      if (!patterns.some((rx) => url.href.search(rx) >= 0)) {
        continue;
      }
      // the first matching player decides; nothing to do if already autoplaying
      if (!action.check(url)) {
        action.handle(url);
      }
      return;
    }
  }

  document.addEventListener("readystatechange", run);

  if (document.readyState === "complete") {
    run();
  }

  // media URLs that were already seen, so each one is reported once
  const mediaSet = new Set();

  setInterval(() => {
    const medias = document.querySelectorAll("video, audio");

    for (const media of medias) {
      try {
        if (media.src && !mediaSet.has(media.src)) {
          if (window.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
            window.__crawler_queueUrls(media.src);
          }
          mediaSet.add(media.src);
        } else if (!media.src) {
          // play() returns a promise in current browsers; its rejection is
          // asynchronous and would bypass the surrounding try/catch
          const playing = media.play();
          if (playing && typeof playing.catch === "function") {
            playing.catch((e) => console.log(e));
          }
        }
      } catch (e) {
        console.log(e);
      }
    }
  }, 3000);
})();
|
|
||||||
|
|
21
config.yaml
21
config.yaml
@ -1,21 +0,0 @@
|
|||||||
debug: true
|
|
||||||
|
|
||||||
|
|
||||||
proxy:
|
|
||||||
coll: capture
|
|
||||||
recording: true
|
|
||||||
|
|
||||||
enable_banner: false
|
|
||||||
enable_wombat: true
|
|
||||||
|
|
||||||
recorder:
|
|
||||||
dedup_policy: skip
|
|
||||||
source_coll: live
|
|
||||||
cache: always
|
|
||||||
|
|
||||||
#autoindex: 10
|
|
||||||
|
|
||||||
enable_auto_fetch: true
|
|
||||||
|
|
||||||
collections:
|
|
||||||
live: $live
|
|
436
crawler.js
436
crawler.js
@ -1,436 +0,0 @@
|
|||||||
const fs = require("fs");
|
|
||||||
const puppeteer = require("puppeteer-core");
|
|
||||||
const { Cluster } = require("puppeteer-cluster");
|
|
||||||
const child_process = require("child_process");
|
|
||||||
const fetch = require("node-fetch");
|
|
||||||
const AbortController = require("abort-controller");
|
|
||||||
|
|
||||||
// MIME types that htmlCheck() accepts as HTML pages
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];

// accepted values for the waitUntil and newContext options
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
const NEW_CONTEXT_OPTS = ["page", "session", "browser"];

const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";

// agent that ignores HTTPS certificate errors, used for the HEAD check
const HTTPS_AGENT = require("https").Agent({
  rejectUnauthorized: false,
});

// log and exit with status 1 on the first SIGINT or SIGTERM
const exitSignals = [
  ['SIGINT', 'SIGINT received, exiting'],
  ['SIGTERM', 'SIGTERM received, exiting'],
];

for (const [signalName, message] of exitSignals) {
  process.once(signalName, () => {
    console.log(message);
    process.exit(1);
  });
}

// source of the autoplay helper, read once at startup
const autoplayScript = fs.readFileSync("./autoplay.js", "utf-8");

// prefix for direct capture via pywb
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;

// request headers shared by the HEAD check and the direct capture
const headers = {"User-Agent": CHROME_USER_AGENT};
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Crawls from a seed URL with a puppeteer-cluster of headless Chrome workers
 * whose traffic goes through the proxy given by PROXY_HOST / PROXY_PORT.
 *
 * Each page is HEAD-checked first (non-HTML resources are captured directly
 * by htmlCheck), then loaded, optionally auto-scrolled, and its links are
 * queued when shouldCrawl() accepts them.
 *
 * @param {object} params - parsed command-line options.
 * @param {string} params.url - seed URL.
 * @param {string} params.waitUntil - Puppeteer page.goto() wait condition.
 * @param {number} params.timeout - page load timeout passed to page.goto().
 * @param {RegExp[]} params.scope - patterns a link must match to be followed.
 * @param {RegExp[]} params.exclude - patterns that exclude a link.
 * @param {number} params.limit - maximum number of queued pages, 0 for no limit.
 * @param {boolean} params.scroll - whether to auto-scroll each page.
 * @param {string} params.newContext - "page", "session" or "browser".
 * @param {number} [params.workers] - number of parallel workers (default 1).
 * @returns {Promise<void>} resolves after the queue is idle, the cluster is
 *   closed and a final 30 second wait has passed.
 */
async function run(params) {
  const { url, waitUntil, timeout, scope, limit, exclude, scroll, newContext } = params;

  // Chrome flags, including proxy server
  const args = [
    "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
    "--no-sandbox",
    "--disable-background-media-suspend",
    "--autoplay-policy=no-user-gesture-required",
  ];

  // Puppeteer options
  const puppeteerOptions = {
    headless: true,
    executablePath: "/opt/google/chrome/google-chrome",
    ignoreHTTPSErrors: true,
    args,
  };

  // map the newContext option to a cluster concurrency mode (page by default)
  let concurrency = Cluster.CONCURRENCY_PAGE;

  switch (newContext) {
    case "page":
      concurrency = Cluster.CONCURRENCY_PAGE;
      break;

    case "session":
      concurrency = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      concurrency = Cluster.CONCURRENCY_BROWSER;
      break;
  }

  // Puppeteer Cluster init and options
  const cluster = await Cluster.launch({
    concurrency,
    maxConcurrency: Number(params.workers) || 1,
    skipDuplicateUrls: true,
    // total timeout for cluster
    timeout: timeout * 2,
    puppeteerOptions,
    puppeteer,
    monitor: true,
  });

  // own list of URLs that were already accepted for crawling
  const seenList = new Set();

  // number of pages queued so far, compared against `limit`
  let numLinks = 0;

  // Crawl task, run once per queued URL
  cluster.task(async ({page, data}) => {
    const pageUrl = data.url;

    // non-HTML resources are handled inside htmlCheck, nothing to load here
    if (!await htmlCheck(pageUrl, capturePrefix)) {
      return;
    }

    // direct captures requested by the autoplay script for media URLs
    const mediaResults = [];

    await page.exposeFunction('__crawler_queueUrls', (mediaUrl) => {
      mediaResults.push(directCapture(mediaUrl));
    });

    // set when the autoplay script reloads an embedded player
    let waitForVideo = false;

    await page.exposeFunction('__crawler_autoplayLoad', (autoplayUrl) => {
      console.log("*** Loading autoplay URL: " + autoplayUrl);
      waitForVideo = true;
    });

    try {
      await page.evaluateOnNewDocument(autoplayScript);
    } catch (e) {
      console.log(e);
    }

    try {
      await page.goto(pageUrl, {waitUntil, timeout});
    } catch (e) {
      // goto can fail for reasons other than a timeout, so keep the error
      console.log(`Load timeout for ${pageUrl}`, e);
    }

    try {
      await Promise.all(mediaResults);
    } catch (e) {
      console.log(`Error loading media URLs`, e);
    }

    if (waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await sleep(15000);
    }

    if (scroll) {
      try {
        // give up on scrolling after 30 seconds
        await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
      } catch (e) {
        console.warn("Behavior Failed", e);
      }
    }

    let results = null;

    try {
      results = await page.evaluate(() => {
        return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href}));
      });
    } catch (e) {
      console.warn("Link Extraction failed", e);
      return;
    }

    try {
      // block-scoped loop variable: the task argument `data` stays untouched
      for (const link of results) {
        const newUrl = shouldCrawl(scope, seenList, link.url, exclude);

        if (newUrl) {
          seenList.add(newUrl);
          if (numLinks++ >= limit && limit > 0) {
            break;
          }
          cluster.queue({url: newUrl});
        }
      }
    } catch (e) {
      console.log("Queuing Error: " + e);
    }
  });

  // the seed URL counts as the first page
  numLinks++;
  cluster.queue({url});

  await cluster.idle();
  await cluster.close();

  // extra wait for all resources to land into WARCs
  console.log("Waiting 30s to ensure WARCs are finished");
  await sleep(30000);
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Decides whether a discovered link should be queued for crawling.
 *
 * @param {Iterable<RegExp>} scope - the link must match at least one of these.
 * @param {Set<string>} seenList - normalized URLs that were already accepted.
 * @param {string} url - the link as found on the page.
 * @param {Iterable<RegExp>} [exclude] - the link must match none of these.
 * @returns {string|false} the URL without its fragment when it should be
 *   crawled, otherwise `false` (unparsable, not http/https, already seen,
 *   out of scope or excluded).
 */
function shouldCrawl(scope, seenList, url, exclude = []) {
  let parsed;

  try {
    parsed = new URL(url);
  } catch (e) {
    return false;
  }

  // remove hashtag
  parsed.hash = "";

  // only queue http/https URLs
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return false;
  }

  const normalized = parsed.href;

  // skip already crawled
  if (seenList.has(normalized)) {
    return false;
  }

  // Patterns with the g or y flag remember lastIndex between calls; resetting
  // it keeps the answer for one URL independent of the URLs tested before.
  const matchesAny = (patterns) => {
    for (const rx of patterns) {
      rx.lastIndex = 0;
      if (rx.test(normalized)) {
        return true;
      }
    }
    return false;
  };

  // check scopes
  if (!matchesAny(scope)) {
    return false;
  }

  // check exclusions
  if (matchesAny(exclude)) {
    return false;
  }

  return normalized;
}
|
|
||||||
|
|
||||||
/**
 * Issues a HEAD request to decide whether a URL should be loaded in the
 * browser.
 *
 * @param {string} url - URL to check.
 * @param {string} capturePrefix - not used by this function; kept so existing
 *   callers keep working (directCapture reads the module-level prefix).
 * @returns {Promise<boolean>} `true` when the page should be loaded in the
 *   browser: HTML content type, no content type, or the check itself failed.
 *   `false` when the status is >= 400, or when the resource is not HTML, in
 *   which case it has been captured directly.
 */
async function htmlCheck(url, capturePrefix) {
  try {
    const agent = url.startsWith("https:") ? HTTPS_AGENT : null;

    const resp = await fetch(url, {method: "HEAD", headers, agent});

    if (resp.status >= 400) {
      console.log(`Skipping ${url}, invalid status ${resp.status}`);
      return false;
    }

    const contentType = resp.headers.get("Content-Type");

    // just load if no content-type
    if (!contentType) {
      return true;
    }

    // media types are case-insensitive and may be followed by whitespace
    // before the parameters, e.g. "Text/HTML ; charset=utf-8"
    const mime = contentType.split(";")[0].trim().toLowerCase();

    if (HTML_TYPES.includes(mime)) {
      return true;
    }

    // capture directly
    await directCapture(url);

    return false;
  } catch (e) {
    console.log("HTML Check error", e);
    // can't confirm not html, so try in browser
    return true;
  }
}
|
|
||||||
|
|
||||||
/**
 * Requests a URL through the capture prefix instead of loading it in the
 * browser.
 *
 * The request is aborted as soon as fetch() has resolved, so the response
 * body is not read here.
 * NOTE(review): presumably the proxy has recorded the resource by then - confirm.
 *
 * @param {string} url - URL of the resource to capture.
 * @returns {Promise<void>} rejects when the fetch itself fails.
 */
async function directCapture(url) {
  console.log(`Direct capture: ${capturePrefix}${url}`);

  const controller = new AbortController();
  await fetch(capturePrefix + url, {signal: controller.signal, headers});
  controller.abort();
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Scrolls the current window down in 250px steps, pausing 500ms after each
 * step, until the bottom of the viewport reaches the document height.
 *
 * Passed to page.evaluate(), so it only uses names available on `self`.
 *
 * @returns {Promise<void>} resolves once no further scrolling is possible.
 */
async function autoScroll() {
  const step = { top: 250, left: 0, behavior: 'auto' };

  // largest of the height measurements exposed by body and root element
  const documentHeight = () => {
    const body = self.document.body;
    const root = self.document.documentElement;
    return Math.max(
      body.scrollHeight,
      body.offsetHeight,
      root.clientHeight,
      root.scrollHeight,
      root.offsetHeight
    );
  };

  const belowViewport = () => self.scrollY + self.innerHeight < documentHeight();

  const pause = () => new Promise((done) => setTimeout(done, 500));

  while (belowViewport()) {
    self.scrollBy(step);
    await pause();
  }
}
|
|
||||||
|
|
||||||
/**
 * Returns a promise that resolves after a delay.
 *
 * @param {number} time - delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(time) {
  return new Promise((wake) => {
    setTimeout(wake, time);
  });
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Command-line entry point: parses and validates the options with yargs,
 * runs the crawl and exits the process with status 0 on success or 1 when
 * the crawl throws.
 *
 * Option normalisation done in the yargs check:
 * - `url` is re-serialised through URL (adds a trailing slash if missing);
 * - `timeout` is converted from seconds to milliseconds;
 * - `scope` and `exclude` become arrays of RegExp; without `--scope` the
 *   scope is the seed URL up to and including its last "/".
 */
async function main() {
  // Accepts nothing, a single pattern or a list of patterns and returns an
  // array of RegExp. Scalars that are not strings are handled as well.
  const toRegexList = (value) =>
    value ? [].concat(value).map((pattern) => new RegExp(pattern)) : [];

  const params = require('yargs')
    .usage("browsertrix-mini [options]")
    .options({
      "url": {
        alias: "u",
        describe: "The URL to start crawling from",
        demandOption: true,
        type: "string",
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        demandOption: false,
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture, can be a new: page, session or browser.",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing",
        default: "load",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scope": {
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

      "exclude": {
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

      "scroll": {
        describe: "If set, will autoscroll to bottom of the page",
        type: "boolean",
        default: false,
      },
    })
    .check((argv) => {
      const url = new URL(argv.url);

      if (url.protocol !== "http:" && url.protocol !== "https:") {
        throw new Error("URL must start with http:// or https://");
      }

      // ensure valid url is used (adds trailing slash if missing)
      argv.url = url.href;

      // default scope: everything under the immediate directory of the URL
      if (!argv.scope) {
        argv.scope = [new RegExp("^" + rxEscape(url.href.slice(0, url.href.lastIndexOf("/") + 1)))];
      }

      // seconds on the command line, milliseconds everywhere else
      argv.timeout *= 1000;

      // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
      // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
      if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
        throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }

      if (!NEW_CONTEXT_OPTS.includes(argv.newContext)) {
        throw new Error("Invalid newContext, must be one of: " + NEW_CONTEXT_OPTS.join(","));
      }

      // Support one or multiple exclude / scope patterns
      argv.exclude = toRegexList(argv.exclude);
      argv.scope = toRegexList(argv.scope);

      return true;
    })
    .argv;

  console.log("Exclusions Regexes: ", params.exclude);
  console.log("Scope Regexes: ", params.scope);

  try {
    await run(params);
    process.exit(0);
  } catch (e) {
    console.error("Crawl failed");
    console.error(e);
    process.exit(1);
  }
}
|
|
||||||
|
|
||||||
/**
 * Escapes a string so it can be embedded literally in a regular expression.
 *
 * @param {string} string - text to escape.
 * @returns {string} the text with a backslash in front of every character
 *   matched by the pattern below (regex metacharacters, "-" and "/").
 */
function rxEscape(string) {
  const specialChars = /[-\/\\^$*+?.()|[\]{}]/g;
  return string.replace(specialChars, (ch) => "\\" + ch);
}
|
|
||||||
|
|
||||||
|
|
||||||
// main() is async: report a rejection raised outside its own try/catch
// (e.g. while parsing options) and exit with a failure status.
main().catch((e) => {
  console.error(e);
  process.exit(1);
});
|
|
||||||
|
|
||||||
|
|
15
package.json
15
package.json
@ -1,15 +0,0 @@
|
|||||||
{
|
|
||||||
"name": "zimit-crawler",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"main": "zimit-crawler",
|
|
||||||
"repository": "https://github.com/openzim/zimit",
|
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>",
|
|
||||||
"license": "MIT",
|
|
||||||
"dependencies": {
|
|
||||||
"abort-controller": "^3.0.0",
|
|
||||||
"node-fetch": "^2.6.1",
|
|
||||||
"puppeteer-cluster": "^0.22.0",
|
|
||||||
"puppeteer-core": "^5.3.1",
|
|
||||||
"yargs": "^16.0.3"
|
|
||||||
}
|
|
||||||
}
|
|
27
uwsgi.ini
27
uwsgi.ini
@ -1,27 +0,0 @@
|
|||||||
[uwsgi]
|
|
||||||
if-not-env = PORT
|
|
||||||
http-socket = :8080
|
|
||||||
socket = :8081
|
|
||||||
endif =
|
|
||||||
|
|
||||||
master = true
|
|
||||||
buffer-size = 65536
|
|
||||||
die-on-term = true
|
|
||||||
|
|
||||||
if-env = VIRTUAL_ENV
|
|
||||||
venv = $(VIRTUAL_ENV)
|
|
||||||
endif =
|
|
||||||
|
|
||||||
gevent = 200
|
|
||||||
|
|
||||||
#Not available until uwsgi 2.1
|
|
||||||
#monkey-patching manually in pywb.apps.wayback
|
|
||||||
#gevent-early-monkey-patch =
|
|
||||||
# for uwsgi<2.1, set env when using gevent
|
|
||||||
env = GEVENT_MONKEY_PATCH=1
|
|
||||||
processes = 8
|
|
||||||
|
|
||||||
# specify config file here
|
|
||||||
env = PYWB_CONFIG_FILE=/app/config.yaml
|
|
||||||
wsgi = pywb.apps.wayback
|
|
||||||
|
|
498
yarn.lock
498
yarn.lock
@ -1,498 +0,0 @@
|
|||||||
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
|
|
||||||
# yarn lockfile v1
|
|
||||||
|
|
||||||
|
|
||||||
"@types/color-name@^1.1.1":
|
|
||||||
version "1.1.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/@types/color-name/-/color-name-1.1.1.tgz#1c1261bbeaa10a8055bbc5d8ab84b7b2afc846a0"
|
|
||||||
integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==
|
|
||||||
|
|
||||||
"@types/node@*":
|
|
||||||
version "14.14.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.14.0.tgz#f1091b6ad5de18e8e91bdbd43ec63f13de372538"
|
|
||||||
integrity sha512-BfbIHP9IapdupGhq/hc+jT5dyiBVZ2DdeC5WwJWQWDb0GijQlzUFAeIQn/2GtvZcd2HVUU7An8felIICFTC2qg==
|
|
||||||
|
|
||||||
"@types/yauzl@^2.9.1":
|
|
||||||
version "2.9.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/@types/yauzl/-/yauzl-2.9.1.tgz#d10f69f9f522eef3cf98e30afb684a1e1ec923af"
|
|
||||||
integrity sha512-A1b8SU4D10uoPjwb0lnHmmu8wZhR9d+9o2PKBQT2jU5YPTKsxac6M2qGAdY7VcL+dHHhARVUDmeg0rOrcd9EjA==
|
|
||||||
dependencies:
|
|
||||||
"@types/node" "*"
|
|
||||||
|
|
||||||
abort-controller@^3.0.0:
|
|
||||||
version "3.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392"
|
|
||||||
integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==
|
|
||||||
dependencies:
|
|
||||||
event-target-shim "^5.0.0"
|
|
||||||
|
|
||||||
agent-base@5:
|
|
||||||
version "5.1.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c"
|
|
||||||
integrity sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g==
|
|
||||||
|
|
||||||
ansi-regex@^5.0.0:
|
|
||||||
version "5.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.0.tgz#388539f55179bf39339c81af30a654d69f87cb75"
|
|
||||||
integrity sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==
|
|
||||||
|
|
||||||
ansi-styles@^4.0.0:
|
|
||||||
version "4.2.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-4.2.1.tgz#90ae75c424d008d2624c5bf29ead3177ebfcf359"
|
|
||||||
integrity sha512-9VGjrMsG1vePxcSweQsN20KY/c4zN0h9fLjqAbwbPfahM3t+NL+M9HC8xeXG2I8pX5NoamTGNuomEUFI7fcUjA==
|
|
||||||
dependencies:
|
|
||||||
"@types/color-name" "^1.1.1"
|
|
||||||
color-convert "^2.0.1"
|
|
||||||
|
|
||||||
balanced-match@^1.0.0:
|
|
||||||
version "1.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.0.tgz#89b4d199ab2bee49de164ea02b89ce462d71b767"
|
|
||||||
integrity sha1-ibTRmasr7kneFk6gK4nORi1xt2c=
|
|
||||||
|
|
||||||
base64-js@^1.0.2:
|
|
||||||
version "1.3.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.3.1.tgz#58ece8cb75dd07e71ed08c736abc5fac4dbf8df1"
|
|
||||||
integrity sha512-mLQ4i2QO1ytvGWFWmcngKO//JXAQueZvwEKtjgQFM4jIK0kU+ytMfplL8j+n5mspOfjHwoAg+9yhb7BwAHm36g==
|
|
||||||
|
|
||||||
bl@^4.0.3:
|
|
||||||
version "4.0.3"
|
|
||||||
resolved "https://registry.yarnpkg.com/bl/-/bl-4.0.3.tgz#12d6287adc29080e22a705e5764b2a9522cdc489"
|
|
||||||
integrity sha512-fs4G6/Hu4/EE+F75J8DuN/0IpQqNjAdC7aEQv7Qt8MHGUH7Ckv2MwTEEeN9QehD0pfIDkMI1bkHYkKy7xHyKIg==
|
|
||||||
dependencies:
|
|
||||||
buffer "^5.5.0"
|
|
||||||
inherits "^2.0.4"
|
|
||||||
readable-stream "^3.4.0"
|
|
||||||
|
|
||||||
brace-expansion@^1.1.7:
|
|
||||||
version "1.1.11"
|
|
||||||
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
|
|
||||||
integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==
|
|
||||||
dependencies:
|
|
||||||
balanced-match "^1.0.0"
|
|
||||||
concat-map "0.0.1"
|
|
||||||
|
|
||||||
buffer-crc32@~0.2.3:
|
|
||||||
version "0.2.13"
|
|
||||||
resolved "https://registry.yarnpkg.com/buffer-crc32/-/buffer-crc32-0.2.13.tgz#0d333e3f00eac50aa1454abd30ef8c2a5d9a7242"
|
|
||||||
integrity sha1-DTM+PwDqxQqhRUq9MO+MKl2ackI=
|
|
||||||
|
|
||||||
buffer@^5.2.1, buffer@^5.5.0:
|
|
||||||
version "5.6.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/buffer/-/buffer-5.6.0.tgz#a31749dc7d81d84db08abf937b6b8c4033f62786"
|
|
||||||
integrity sha512-/gDYp/UtU0eA1ys8bOs9J6a+E/KWIY+DZ+Q2WESNUA0jFRsJOc0SNUO6xJ5SGA1xueg3NL65W6s+NY5l9cunuw==
|
|
||||||
dependencies:
|
|
||||||
base64-js "^1.0.2"
|
|
||||||
ieee754 "^1.1.4"
|
|
||||||
|
|
||||||
chownr@^1.1.1:
|
|
||||||
version "1.1.4"
|
|
||||||
resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b"
|
|
||||||
integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
|
|
||||||
|
|
||||||
cliui@^7.0.0:
|
|
||||||
version "7.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/cliui/-/cliui-7.0.1.tgz#a4cb67aad45cd83d8d05128fc9f4d8fbb887e6b3"
|
|
||||||
integrity sha512-rcvHOWyGyid6I1WjT/3NatKj2kDt9OdSHSXpyLXaMWFbKpGACNW8pRhhdPUq9MWUOdwn8Rz9AVETjF4105rZZQ==
|
|
||||||
dependencies:
|
|
||||||
string-width "^4.2.0"
|
|
||||||
strip-ansi "^6.0.0"
|
|
||||||
wrap-ansi "^7.0.0"
|
|
||||||
|
|
||||||
color-convert@^2.0.1:
|
|
||||||
version "2.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-2.0.1.tgz#72d3a68d598c9bdb3af2ad1e84f21d896abd4de3"
|
|
||||||
integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==
|
|
||||||
dependencies:
|
|
||||||
color-name "~1.1.4"
|
|
||||||
|
|
||||||
color-name@~1.1.4:
|
|
||||||
version "1.1.4"
|
|
||||||
resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2"
|
|
||||||
integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
|
|
||||||
|
|
||||||
concat-map@0.0.1:
|
|
||||||
version "0.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b"
|
|
||||||
integrity sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=
|
|
||||||
|
|
||||||
debug@4, debug@^4.1.0, debug@^4.1.1:
|
|
||||||
version "4.2.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/debug/-/debug-4.2.0.tgz#7f150f93920e94c58f5574c2fd01a3110effe7f1"
|
|
||||||
integrity sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg==
|
|
||||||
dependencies:
|
|
||||||
ms "2.1.2"
|
|
||||||
|
|
||||||
devtools-protocol@0.0.799653:
|
|
||||||
version "0.0.799653"
|
|
||||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.799653.tgz#86fc95ce5bf4fdf4b77a58047ba9d2301078f119"
|
|
||||||
integrity sha512-t1CcaZbvm8pOlikqrsIM9GOa7Ipp07+4h/q9u0JXBWjPCjHdBl9KkddX87Vv9vBHoBGtwV79sYQNGnQM6iS5gg==
|
|
||||||
|
|
||||||
emoji-regex@^8.0.0:
|
|
||||||
version "8.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-8.0.0.tgz#e818fd69ce5ccfcb404594f842963bf53164cc37"
|
|
||||||
integrity sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==
|
|
||||||
|
|
||||||
end-of-stream@^1.1.0, end-of-stream@^1.4.1:
|
|
||||||
version "1.4.4"
|
|
||||||
resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0"
|
|
||||||
integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==
|
|
||||||
dependencies:
|
|
||||||
once "^1.4.0"
|
|
||||||
|
|
||||||
escalade@^3.0.2:
|
|
||||||
version "3.1.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e"
|
|
||||||
integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig==
|
|
||||||
|
|
||||||
event-target-shim@^5.0.0:
|
|
||||||
version "5.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
|
|
||||||
integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
|
|
||||||
|
|
||||||
extract-zip@^2.0.0:
|
|
||||||
version "2.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a"
|
|
||||||
integrity sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==
|
|
||||||
dependencies:
|
|
||||||
debug "^4.1.1"
|
|
||||||
get-stream "^5.1.0"
|
|
||||||
yauzl "^2.10.0"
|
|
||||||
optionalDependencies:
|
|
||||||
"@types/yauzl" "^2.9.1"
|
|
||||||
|
|
||||||
fd-slicer@~1.1.0:
|
|
||||||
version "1.1.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/fd-slicer/-/fd-slicer-1.1.0.tgz#25c7c89cb1f9077f8891bbe61d8f390eae256f1e"
|
|
||||||
integrity sha1-JcfInLH5B3+IkbvmHY85Dq4lbx4=
|
|
||||||
dependencies:
|
|
||||||
pend "~1.2.0"
|
|
||||||
|
|
||||||
find-up@^4.0.0:
|
|
||||||
version "4.1.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/find-up/-/find-up-4.1.0.tgz#97afe7d6cdc0bc5928584b7c8d7b16e8a9aa5d19"
|
|
||||||
integrity sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==
|
|
||||||
dependencies:
|
|
||||||
locate-path "^5.0.0"
|
|
||||||
path-exists "^4.0.0"
|
|
||||||
|
|
||||||
fs-constants@^1.0.0:
|
|
||||||
version "1.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad"
|
|
||||||
integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==
|
|
||||||
|
|
||||||
fs.realpath@^1.0.0:
|
|
||||||
version "1.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f"
|
|
||||||
integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8=
|
|
||||||
|
|
||||||
get-caller-file@^2.0.5:
|
|
||||||
version "2.0.5"
|
|
||||||
resolved "https://registry.yarnpkg.com/get-caller-file/-/get-caller-file-2.0.5.tgz#4f94412a82db32f36e3b0b9741f8a97feb031f7e"
|
|
||||||
integrity sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==
|
|
||||||
|
|
||||||
get-stream@^5.1.0:
|
|
||||||
version "5.2.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-5.2.0.tgz#4966a1795ee5ace65e706c4b7beb71257d6e22d3"
|
|
||||||
integrity sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==
|
|
||||||
dependencies:
|
|
||||||
pump "^3.0.0"
|
|
||||||
|
|
||||||
glob@^7.1.3:
|
|
||||||
version "7.1.6"
|
|
||||||
resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.6.tgz#141f33b81a7c2492e125594307480c46679278a6"
|
|
||||||
integrity sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==
|
|
||||||
dependencies:
|
|
||||||
fs.realpath "^1.0.0"
|
|
||||||
inflight "^1.0.4"
|
|
||||||
inherits "2"
|
|
||||||
minimatch "^3.0.4"
|
|
||||||
once "^1.3.0"
|
|
||||||
path-is-absolute "^1.0.0"
|
|
||||||
|
|
||||||
https-proxy-agent@^4.0.0:
|
|
||||||
version "4.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz#702b71fb5520a132a66de1f67541d9e62154d82b"
|
|
||||||
integrity sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg==
|
|
||||||
dependencies:
|
|
||||||
agent-base "5"
|
|
||||||
debug "4"
|
|
||||||
|
|
||||||
ieee754@^1.1.4:
|
|
||||||
version "1.1.13"
|
|
||||||
resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.1.13.tgz#ec168558e95aa181fd87d37f55c32bbcb6708b84"
|
|
||||||
integrity sha512-4vf7I2LYV/HaWerSo3XmlMkp5eZ83i+/CDluXi/IGTs/O1sejBNhTtnxzmRZfvOUqj7lZjqHkeTvpgSFDlWZTg==
|
|
||||||
|
|
||||||
inflight@^1.0.4:
|
|
||||||
version "1.0.6"
|
|
||||||
resolved "https://registry.yarnpkg.com/inflight/-/inflight-1.0.6.tgz#49bd6331d7d02d0c09bc910a1075ba8165b56df9"
|
|
||||||
integrity sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=
|
|
||||||
dependencies:
|
|
||||||
once "^1.3.0"
|
|
||||||
wrappy "1"
|
|
||||||
|
|
||||||
inherits@2, inherits@^2.0.3, inherits@^2.0.4:
|
|
||||||
version "2.0.4"
|
|
||||||
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
|
|
||||||
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
|
|
||||||
|
|
||||||
is-fullwidth-code-point@^3.0.0:
|
|
||||||
version "3.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz#f116f8064fe90b3f7844a38997c0b75051269f1d"
|
|
||||||
integrity sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==
|
|
||||||
|
|
||||||
locate-path@^5.0.0:
|
|
||||||
version "5.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/locate-path/-/locate-path-5.0.0.tgz#1afba396afd676a6d42504d0a67a3a7eb9f62aa0"
|
|
||||||
integrity sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==
|
|
||||||
dependencies:
|
|
||||||
p-locate "^4.1.0"
|
|
||||||
|
|
||||||
minimatch@^3.0.4:
|
|
||||||
version "3.0.4"
|
|
||||||
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
|
|
||||||
integrity sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==
|
|
||||||
dependencies:
|
|
||||||
brace-expansion "^1.1.7"
|
|
||||||
|
|
||||||
mkdirp-classic@^0.5.2:
|
|
||||||
version "0.5.3"
|
|
||||||
resolved "https://registry.yarnpkg.com/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz#fa10c9115cc6d8865be221ba47ee9bed78601113"
|
|
||||||
integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==
|
|
||||||
|
|
||||||
ms@2.1.2:
|
|
||||||
version "2.1.2"
|
|
||||||
resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
|
|
||||||
integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
|
|
||||||
|
|
||||||
node-fetch@^2.6.1:
|
|
||||||
version "2.6.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
|
|
||||||
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
|
|
||||||
|
|
||||||
once@^1.3.0, once@^1.3.1, once@^1.4.0:
|
|
||||||
version "1.4.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
|
|
||||||
integrity sha1-WDsap3WWHUsROsF9nFC6753Xa9E=
|
|
||||||
dependencies:
|
|
||||||
wrappy "1"
|
|
||||||
|
|
||||||
p-limit@^2.2.0:
|
|
||||||
version "2.3.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.3.0.tgz#3dd33c647a214fdfffd835933eb086da0dc21db1"
|
|
||||||
integrity sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==
|
|
||||||
dependencies:
|
|
||||||
p-try "^2.0.0"
|
|
||||||
|
|
||||||
p-locate@^4.1.0:
|
|
||||||
version "4.1.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/p-locate/-/p-locate-4.1.0.tgz#a3428bb7088b3a60292f66919278b7c297ad4f07"
|
|
||||||
integrity sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==
|
|
||||||
dependencies:
|
|
||||||
p-limit "^2.2.0"
|
|
||||||
|
|
||||||
p-try@^2.0.0:
|
|
||||||
version "2.2.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6"
|
|
||||||
integrity sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==
|
|
||||||
|
|
||||||
path-exists@^4.0.0:
|
|
||||||
version "4.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/path-exists/-/path-exists-4.0.0.tgz#513bdbe2d3b95d7762e8c1137efa195c6c61b5b3"
|
|
||||||
integrity sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==
|
|
||||||
|
|
||||||
path-is-absolute@^1.0.0:
|
|
||||||
version "1.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f"
|
|
||||||
integrity sha1-F0uSaHNVNP+8es5r9TpanhtcX18=
|
|
||||||
|
|
||||||
pend@~1.2.0:
|
|
||||||
version "1.2.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
|
|
||||||
integrity sha1-elfrVQpng/kRUzH89GY9XI4AelA=
|
|
||||||
|
|
||||||
pkg-dir@^4.2.0:
|
|
||||||
version "4.2.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3"
|
|
||||||
integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==
|
|
||||||
dependencies:
|
|
||||||
find-up "^4.0.0"
|
|
||||||
|
|
||||||
progress@^2.0.1:
|
|
||||||
version "2.0.3"
|
|
||||||
resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8"
|
|
||||||
integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==
|
|
||||||
|
|
||||||
proxy-from-env@^1.0.0:
|
|
||||||
version "1.1.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2"
|
|
||||||
integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==
|
|
||||||
|
|
||||||
pump@^3.0.0:
|
|
||||||
version "3.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/pump/-/pump-3.0.0.tgz#b4a2116815bde2f4e1ea602354e8c75565107a64"
|
|
||||||
integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==
|
|
||||||
dependencies:
|
|
||||||
end-of-stream "^1.1.0"
|
|
||||||
once "^1.3.1"
|
|
||||||
|
|
||||||
puppeteer-cluster@^0.22.0:
|
|
||||||
version "0.22.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/puppeteer-cluster/-/puppeteer-cluster-0.22.0.tgz#4ab214671f414f15ad6a94a4b61ed0b4172e86e6"
|
|
||||||
integrity sha512-hmydtMwfVM+idFIDzS8OXetnujHGre7RY3BGL+3njy9+r8Dcu3VALkZHfuBEPf6byKssTCgzxU1BvLczifXd5w==
|
|
||||||
dependencies:
|
|
||||||
debug "^4.1.1"
|
|
||||||
|
|
||||||
puppeteer-core@^5.3.1:
|
|
||||||
version "5.3.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.1.tgz#1affb1738afac499416a7fd4ed2ed0c18577e88f"
|
|
||||||
integrity sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==
|
|
||||||
dependencies:
|
|
||||||
debug "^4.1.0"
|
|
||||||
devtools-protocol "0.0.799653"
|
|
||||||
extract-zip "^2.0.0"
|
|
||||||
https-proxy-agent "^4.0.0"
|
|
||||||
pkg-dir "^4.2.0"
|
|
||||||
progress "^2.0.1"
|
|
||||||
proxy-from-env "^1.0.0"
|
|
||||||
rimraf "^3.0.2"
|
|
||||||
tar-fs "^2.0.0"
|
|
||||||
unbzip2-stream "^1.3.3"
|
|
||||||
ws "^7.2.3"
|
|
||||||
|
|
||||||
readable-stream@^3.1.1, readable-stream@^3.4.0:
|
|
||||||
version "3.6.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198"
|
|
||||||
integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==
|
|
||||||
dependencies:
|
|
||||||
inherits "^2.0.3"
|
|
||||||
string_decoder "^1.1.1"
|
|
||||||
util-deprecate "^1.0.1"
|
|
||||||
|
|
||||||
require-directory@^2.1.1:
|
|
||||||
version "2.1.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42"
|
|
||||||
integrity sha1-jGStX9MNqxyXbiNE/+f3kqam30I=
|
|
||||||
|
|
||||||
rimraf@^3.0.2:
|
|
||||||
version "3.0.2"
|
|
||||||
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
|
|
||||||
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
|
|
||||||
dependencies:
|
|
||||||
glob "^7.1.3"
|
|
||||||
|
|
||||||
safe-buffer@~5.2.0:
|
|
||||||
version "5.2.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6"
|
|
||||||
integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
|
|
||||||
|
|
||||||
string-width@^4.1.0, string-width@^4.2.0:
|
|
||||||
version "4.2.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.0.tgz#952182c46cc7b2c313d1596e623992bd163b72b5"
|
|
||||||
integrity sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg==
|
|
||||||
dependencies:
|
|
||||||
emoji-regex "^8.0.0"
|
|
||||||
is-fullwidth-code-point "^3.0.0"
|
|
||||||
strip-ansi "^6.0.0"
|
|
||||||
|
|
||||||
string_decoder@^1.1.1:
|
|
||||||
version "1.3.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
|
|
||||||
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
|
|
||||||
dependencies:
|
|
||||||
safe-buffer "~5.2.0"
|
|
||||||
|
|
||||||
strip-ansi@^6.0.0:
|
|
||||||
version "6.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.0.tgz#0b1571dd7669ccd4f3e06e14ef1eed26225ae532"
|
|
||||||
integrity sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==
|
|
||||||
dependencies:
|
|
||||||
ansi-regex "^5.0.0"
|
|
||||||
|
|
||||||
tar-fs@^2.0.0:
|
|
||||||
version "2.1.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.0.tgz#d1cdd121ab465ee0eb9ccde2d35049d3f3daf0d5"
|
|
||||||
integrity sha512-9uW5iDvrIMCVpvasdFHW0wJPez0K4JnMZtsuIeDI7HyMGJNxmDZDOCQROr7lXyS+iL/QMpj07qcjGYTSdRFXUg==
|
|
||||||
dependencies:
|
|
||||||
chownr "^1.1.1"
|
|
||||||
mkdirp-classic "^0.5.2"
|
|
||||||
pump "^3.0.0"
|
|
||||||
tar-stream "^2.0.0"
|
|
||||||
|
|
||||||
tar-stream@^2.0.0:
|
|
||||||
version "2.1.4"
|
|
||||||
resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.1.4.tgz#c4fb1a11eb0da29b893a5b25476397ba2d053bfa"
|
|
||||||
integrity sha512-o3pS2zlG4gxr67GmFYBLlq+dM8gyRGUOvsrHclSkvtVtQbjV0s/+ZE8OpICbaj8clrX3tjeHngYGP7rweaBnuw==
|
|
||||||
dependencies:
|
|
||||||
bl "^4.0.3"
|
|
||||||
end-of-stream "^1.4.1"
|
|
||||||
fs-constants "^1.0.0"
|
|
||||||
inherits "^2.0.3"
|
|
||||||
readable-stream "^3.1.1"
|
|
||||||
|
|
||||||
through@^2.3.8:
|
|
||||||
version "2.3.8"
|
|
||||||
resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
|
|
||||||
integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=
|
|
||||||
|
|
||||||
unbzip2-stream@^1.3.3:
|
|
||||||
version "1.4.3"
|
|
||||||
resolved "https://registry.yarnpkg.com/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz#b0da04c4371311df771cdc215e87f2130991ace7"
|
|
||||||
integrity sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==
|
|
||||||
dependencies:
|
|
||||||
buffer "^5.2.1"
|
|
||||||
through "^2.3.8"
|
|
||||||
|
|
||||||
util-deprecate@^1.0.1:
|
|
||||||
version "1.0.2"
|
|
||||||
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
|
|
||||||
integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=
|
|
||||||
|
|
||||||
wrap-ansi@^7.0.0:
|
|
||||||
version "7.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
|
|
||||||
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
|
|
||||||
dependencies:
|
|
||||||
ansi-styles "^4.0.0"
|
|
||||||
string-width "^4.1.0"
|
|
||||||
strip-ansi "^6.0.0"
|
|
||||||
|
|
||||||
wrappy@1:
|
|
||||||
version "1.0.2"
|
|
||||||
resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f"
|
|
||||||
integrity sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=
|
|
||||||
|
|
||||||
ws@^7.2.3:
|
|
||||||
version "7.3.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/ws/-/ws-7.3.1.tgz#d0547bf67f7ce4f12a72dfe31262c68d7dc551c8"
|
|
||||||
integrity sha512-D3RuNkynyHmEJIpD2qrgVkc9DQ23OrN/moAwZX4L8DfvszsJxpjQuUq3LMx6HoYji9fbIOBY18XWBsAux1ZZUA==
|
|
||||||
|
|
||||||
y18n@^5.0.1:
|
|
||||||
version "5.0.1"
|
|
||||||
resolved "https://registry.yarnpkg.com/y18n/-/y18n-5.0.1.tgz#1ad2a7eddfa8bce7caa2e1f6b5da96c39d99d571"
|
|
||||||
integrity sha512-/jJ831jEs4vGDbYPQp4yGKDYPSCCEQ45uZWJHE1AoYBzqdZi8+LDWas0z4HrmJXmKdpFsTiowSHXdxyFhpmdMg==
|
|
||||||
|
|
||||||
yargs-parser@^20.0.0:
|
|
||||||
version "20.0.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-20.0.0.tgz#c65a1daaa977ad63cebdd52159147b789a4e19a9"
|
|
||||||
integrity sha512-8eblPHTL7ZWRkyjIZJjnGf+TijiKJSwA24svzLRVvtgoi/RZiKa9fFQTrlx0OKLnyHSdt/enrdadji6WFfESVA==
|
|
||||||
|
|
||||||
yargs@^16.0.3:
|
|
||||||
version "16.0.3"
|
|
||||||
resolved "https://registry.yarnpkg.com/yargs/-/yargs-16.0.3.tgz#7a919b9e43c90f80d4a142a89795e85399a7e54c"
|
|
||||||
integrity sha512-6+nLw8xa9uK1BOEOykaiYAJVh6/CjxWXK/q9b5FpRgNslt8s22F2xMBqVIKgCRjNgGvGPBy8Vog7WN7yh4amtA==
|
|
||||||
dependencies:
|
|
||||||
cliui "^7.0.0"
|
|
||||||
escalade "^3.0.2"
|
|
||||||
get-caller-file "^2.0.5"
|
|
||||||
require-directory "^2.1.1"
|
|
||||||
string-width "^4.2.0"
|
|
||||||
y18n "^5.0.1"
|
|
||||||
yargs-parser "^20.0.0"
|
|
||||||
|
|
||||||
yauzl@^2.10.0:
|
|
||||||
version "2.10.0"
|
|
||||||
resolved "https://registry.yarnpkg.com/yauzl/-/yauzl-2.10.0.tgz#c7eb17c93e112cb1086fa6d8e51fb0667b79a5f9"
|
|
||||||
integrity sha1-x+sXyT4RLLEIb6bY5R+wZnt5pfk=
|
|
||||||
dependencies:
|
|
||||||
buffer-crc32 "~0.2.3"
|
|
||||||
fd-slicer "~1.1.0"
|
|
26
zimit.py
26
zimit.py
@ -114,31 +114,15 @@ def zimit(args=None):
|
|||||||
|
|
||||||
atexit.register(cleanup)
|
atexit.register(cleanup)
|
||||||
|
|
||||||
# create pywb collection
|
|
||||||
print("")
|
|
||||||
print("----------")
|
|
||||||
print("pywb init")
|
|
||||||
subprocess.run(
|
|
||||||
["/usr/bin/env", "wb-manager", "init", "capture"], check=True, cwd=temp_root_dir
|
|
||||||
) # nosec
|
|
||||||
|
|
||||||
subprocess.Popen(
|
|
||||||
["/usr/bin/env", "redis-server"], cwd=temp_root_dir, stdout=subprocess.DEVNULL
|
|
||||||
) # nosec
|
|
||||||
|
|
||||||
subprocess.Popen(
|
|
||||||
["/usr/bin/env", "uwsgi", os.getcwd() + "/uwsgi.ini"],
|
|
||||||
cwd=temp_root_dir,
|
|
||||||
stdout=subprocess.DEVNULL,
|
|
||||||
stderr=subprocess.DEVNULL,
|
|
||||||
) # nosec
|
|
||||||
|
|
||||||
cmd_args = get_node_cmd_line(zimit_args)
|
cmd_args = get_node_cmd_line(zimit_args)
|
||||||
|
cmd_args.append("--cwd")
|
||||||
|
cmd_args.append(str(temp_root_dir))
|
||||||
|
|
||||||
cmd_line = " ".join(cmd_args)
|
cmd_line = " ".join(cmd_args)
|
||||||
|
|
||||||
print("")
|
print("")
|
||||||
print("----------")
|
print("----------")
|
||||||
print("running zimit driver: " + cmd_line)
|
print("running browsertrix-crawler crawl: " + cmd_line)
|
||||||
subprocess.run(cmd_args, check=True)
|
subprocess.run(cmd_args, check=True)
|
||||||
|
|
||||||
warc_files = temp_root_dir / "collections" / "capture" / "archive"
|
warc_files = temp_root_dir / "collections" / "capture" / "archive"
|
||||||
@ -153,7 +137,7 @@ def zimit(args=None):
|
|||||||
return warc2zim(warc2zim_args)
|
return warc2zim(warc2zim_args)
|
||||||
|
|
||||||
def get_node_cmd_line(args):
|
def get_node_cmd_line(args):
|
||||||
node_cmd = ["/usr/bin/env", "node", "crawler.js"]
|
node_cmd = ["crawl"]
|
||||||
for arg in [
|
for arg in [
|
||||||
"url",
|
"url",
|
||||||
"workers",
|
"workers",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user