Merge pull request #46 from openzim/crawler-split

Split zimit from webrecorder/browsertrix-crawler
This commit is contained in:
rgaudin 2020-11-10 09:16:46 +00:00 committed by GitHub
commit 0e3af5124b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 72 additions and 1154 deletions

19
.github/workflows/ci.yaml vendored Normal file
View File

@ -0,0 +1,19 @@
name: CI
on: [push, pull_request]
jobs:
integration-tests:
runs-on: ubuntu-latest
steps:
- name: checkout
uses: actions/checkout@v2
- name: build image
run: docker build -t openzim/zimit:dev .
- name: run crawl
run: docker run -v $PWD/output:/output openzim/zimit:dev zimit --url http://isago.ml/ --name isago --zim-file isago.zim
- name: ensure zim exists
run: stat ./output/isago.zim

1
.gitignore vendored
View File

@ -4,3 +4,4 @@ __pycache__
*.egg-info/
collections/
node_modules/
output/

View File

@ -1,40 +1,12 @@
FROM oldwebtoday/chrome:84 as chrome
FROM webrecorder/browsertrix-crawler:0.1.1
FROM nikolaik/python-nodejs:python3.8-nodejs14
RUN apt-get update -y \
&& apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
ENV PROXY_HOST=localhost \
PROXY_PORT=8080 \
PROXY_CA_URL=http://wsgiprox/download/pem \
PROXY_CA_FILE=/tmp/proxy-ca.pem \
NO_SOCAT=1
# Quote the requirement: an unquoted ">=" is treated by the shell as an output
# redirection, which would install an unpinned gevent and create a file named "=20.9.0".
RUN pip install 'gevent>=20.9.0' uwsgi
#RUN pip install git+https://github.com/openzim/warc2zim@fuzzy-match
RUN pip install 'warc2zim>=1.3.0'
RUN pip install git+https://github.com/webrecorder/pywb@patch-work
COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/
COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/
COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/
RUN mkdir -p /output
WORKDIR /app
ADD package.json /app/
RUN pip install 'warc2zim>=1.3.1' 'requests>=2.24.0'
RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD zimit.py /app/
ADD crawler.js /app/
ADD autoplay.js /app/
RUN ln -s /app/zimit.py /usr/bin/zimit

View File

@ -12,17 +12,16 @@ Technical background
This version of Zimit runs a single-site headless-Chrome based crawl in a Docker container and produces a ZIM of the crawled content.
The system uses:
- `oldwebtoday/chrome` - to install a recent version of Chrome (currently Chrome 84)
- `puppeteer-cluster` - for running Chrome browsers in parallel
- `pywb` - in recording mode for capturing the content
- `warc2zim` - to convert the crawled WARC files into a ZIM
The system extends the crawling system in [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) and converts
the crawled WARC files to ZIM using [warc2zim](https://github.com/openzim/warc2zim).
The driver in `crawler.js` crawls a given URL using puppeteer-cluster.
The `zimit.py` is the entrypoint for the system.
After the crawl is done, warc2zim is used to write a zim to the
`/output` directory, which can be mounted as a volume.
Using the `--keep` flag, the crawled WARCs will also be kept in a temp directory inside `/output`.
Usage
-----
@ -44,6 +43,7 @@ The image accepts the following parameters:
- `--limit U` - Limit capture to at most U URLs
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times.
- `--scroll [N]` - if set, activates a simple auto-scroll behavior on each page, scrolling for up to N seconds
- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
The following is an example usage. The `--cap-add` and `--shm-size`
flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips).
@ -52,7 +52,7 @@ Example command:
```bash
docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN \
--shm-size=1gb openzim/zimit URL --name myzimfile --workers 2 --wait-until domcontentloaded
--shm-size=1gb openzim/zimit zimit --url URL --name myzimfile --workers 2 --waitUntil domcontentloaded
```
The puppeteer-cluster provides monitoring output which is enabled by

View File

@ -1,89 +0,0 @@
/*
 * Autoplay helper, evaluated in every new document by the crawler.
 *
 * - For known embedded players (SoundCloud, Vimeo, YouTube) it reloads the
 *   frame with the player's autoplay query parameter set, unless it is
 *   already set.
 * - Every 3 seconds it scans <video>/<audio> elements: elements with a `src`
 *   are reported once through `window.__crawler_queueUrls` (when that hook
 *   exists and the src is http/https); elements without a `src` are asked
 *   to start playing.
 */
(() => {
  function run() {
    // Only ever process a document once.
    if (window.navigator.__crawler_autoplay) {
      return;
    }

    //console.log("checking autoplay for " + document.location.href);
    window.navigator.__crawler_autoplay = true;

    // Each rule: `rx` (one regex or a list) selects the player URL,
    // `check` tells whether autoplay is already enabled, `handle` enables it
    // by navigating to the adjusted URL.
    const specialActions = [
      {
        rx: /w\.soundcloud\.com/,
        check(url) {
          const autoplay = url.searchParams.get('auto_play');
          return autoplay === 'true';
        },
        handle(url) {
          url.searchParams.set('auto_play', 'true');
          // set continuous_play to true in order to handle
          // a playlist etc
          url.searchParams.set('continuous_play', 'true');
          self.location.href = url.href;
        },
      },
      {
        rx: [/player\.vimeo\.com/, /youtube\.com\/embed\//],
        check(url) {
          const autoplay = url.searchParams.get('autoplay');
          return autoplay === '1';
        },
        handle(url) {
          url.searchParams.set('autoplay', '1');
          // Notify the crawler (if the hook is exposed) that a video is loading.
          if (window.__crawler_autoplayLoad) {
            window.__crawler_autoplayLoad(url.href);
          }
          self.location.href = url.href;
        },
      },
    ];

    const url = new URL(self.location.href);

    for (const action of specialActions) {
      // Normalise `rx` so single regexes and lists share one code path.
      const patterns = Array.isArray(action.rx) ? action.rx : [action.rx];
      if (!patterns.some((rx) => url.href.search(rx) >= 0)) {
        continue;
      }
      // First matching rule wins; nothing to do if autoplay is already on.
      if (!action.check(url)) {
        action.handle(url);
      }
      return;
    }
  }

  document.addEventListener("readystatechange", run);

  if (document.readyState === "complete") {
    run();
  }

  // Media URLs that have already been seen, so each is reported only once.
  const mediaSet = new Set();

  setInterval(() => {
    const medias = document.querySelectorAll("video, audio");

    for (const media of medias) {
      try {
        if (!media.src) {
          // play() may return a promise; a rejection would bypass the
          // surrounding try/catch, so it is logged explicitly.
          const playback = media.play();
          if (playback && typeof playback.catch === "function") {
            playback.catch((e) => console.log(e));
          }
          continue;
        }

        if (mediaSet.has(media.src)) {
          continue;
        }

        if (window.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
          window.__crawler_queueUrls(media.src);
        }
        mediaSet.add(media.src);
      } catch(e) {
        console.log(e);
      }
    }
  }, 3000);
})();

View File

@ -1,21 +0,0 @@
debug: true
proxy:
coll: capture
recording: true
enable_banner: false
enable_wombat: true
recorder:
dedup_policy: skip
source_coll: live
cache: always
#autoindex: 10
enable_auto_fetch: true
collections:
live: $live

View File

@ -1,436 +0,0 @@
const fs = require("fs");
const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
const fetch = require("node-fetch");
const AbortController = require("abort-controller");
// Content types for which htmlCheck() lets the page be loaded in the browser.
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
// Accepted values for the waitUntil option (validated in main()).
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
// Accepted values for the newContext option (validated in main(), mapped to a
// Cluster concurrency mode in run()).
const NEW_CONTEXT_OPTS = ["page", "session", "browser"];
// User-Agent string sent with the crawler's own fetch() requests (see `headers`).
const CHROME_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36";
// to ignore HTTPS error for HEAD check
// (certificate validation is disabled on this agent; htmlCheck() uses it for https: URLs)
const HTTPS_AGENT = require("https").Agent({
rejectUnauthorized: false,
});
// Exit with status 1 on the first SIGINT instead of waiting for the crawl to finish.
process.once('SIGINT', (code) => {
console.log('SIGINT received, exiting');
process.exit(1);
});
// Same handling for SIGTERM.
process.once('SIGTERM', (code) => {
console.log('SIGTERM received, exiting');
process.exit(1);
});
// Source of the autoplay helper, read once at startup from the current working
// directory; run() passes it to page.evaluateOnNewDocument().
const autoplayScript = fs.readFileSync("./autoplay.js", "utf-8");
// prefix for direct capture via pywb
// (built from the PROXY_HOST / PROXY_PORT environment variables)
const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`;
// Request headers shared by htmlCheck() and directCapture().
const headers = {"User-Agent": CHROME_USER_AGENT};
/**
 * Run the crawl: launch a puppeteer cluster, visit the seed URL and every
 * in-scope link discovered from it, then wait for the recorder to settle.
 *
 * @param {object} params - Parsed command-line options (see main()).
 *   Uses `url`, `waitUntil`, `timeout`, `scope`, `limit`, `exclude`,
 *   `scroll`, `newContext` and `workers`. `timeout` is passed to
 *   page.goto() as-is and doubled for the cluster-level task timeout;
 *   `scope` and `exclude` are arrays of RegExp; `limit` of 0 means no limit.
 * @returns {Promise<void>} Resolves once the cluster is closed and the
 *   final 30s grace period has elapsed.
 */
async function run(params) {
  // Chrome Flags, including proxy server
  const args = [
    "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
    "--no-sandbox",
    "--disable-background-media-suspend",
    "--autoplay-policy=no-user-gesture-required",
  ];

  // Puppeter Options
  const puppeteerOptions = {
    headless: true,
    executablePath: "/opt/google/chrome/google-chrome",
    ignoreHTTPSErrors: true,
    args
  };

  // params
  const { url, waitUntil, timeout, scope, limit, exclude, scroll, newContext } = params;

  // Map the newContext option onto a cluster concurrency mode (default: page).
  let concurrency = Cluster.CONCURRENCY_PAGE;

  switch (newContext) {
    case "page":
      concurrency = Cluster.CONCURRENCY_PAGE;
      break;

    case "session":
      concurrency = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      concurrency = Cluster.CONCURRENCY_BROWSER;
      break;
  }

  // Puppeteer Cluster init and options
  const cluster = await Cluster.launch({
    concurrency,
    maxConcurrency: Number(params.workers) || 1,
    skipDuplicateUrls: true,
    // total timeout for cluster
    timeout: timeout * 2,
    puppeteerOptions,
    puppeteer,
    monitor: true
  });

  // Maintain own seen list. The seed URL is recorded up front so that a link
  // back to the start page is not counted against the limit a second time.
  const seenList = new Set([url]);

  //console.log("Limit: " + limit);

  // links crawled counter
  let numLinks = 0;

  // Crawl Task
  cluster.task(async ({page, data}) => {
    const {url} = data;

    // Non-HTML resources are captured directly by htmlCheck(); skip the browser.
    if (!await htmlCheck(url, capturePrefix)) {
      return;
    }

    //page.on('console', message => console.log(`${message.type()} ${message.text()}`));
    //page.on('pageerror', message => console.warn(message));
    //page.on('error', message => console.warn(message));
    //page.on('requestfailed', message => console.warn(message._failureText));

    // Pending direct captures of media URLs reported by the autoplay script.
    const mediaResults = [];

    await page.exposeFunction('__crawler_queueUrls', (url) => {
      mediaResults.push(directCapture(url));
    });

    // Set by the autoplay script when it reloads an embedded video player.
    let waitForVideo = false;

    await page.exposeFunction('__crawler_autoplayLoad', (url) => {
      console.log("*** Loading autoplay URL: " + url);
      waitForVideo = true;
    });

    try {
      await page.evaluateOnNewDocument(autoplayScript);
    } catch(e) {
      console.log(e);
    }

    // A failed or slow load is not fatal: continue with whatever was loaded.
    try {
      await page.goto(url, {waitUntil, timeout});
    } catch (e) {
      console.log(`Load timeout for ${url}`);
    }

    try {
      await Promise.all(mediaResults);
    } catch (e) {
      console.log(`Error loading media URLs`, e);
    }

    if (waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await sleep(15000);
    }

    if (scroll) {
      try {
        // Scroll to the bottom, but give up after 30s.
        await Promise.race([page.evaluate(autoScroll), sleep(30000)]);
      } catch (e) {
        console.warn("Behavior Failed", e);
      }
    }

    let results = null;

    try {
      results = await page.evaluate(() => {
        return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href}))
      });
    } catch (e) {
      console.warn("Link Extraction failed", e);
      return;
    }

    try {
      // Use a dedicated loop variable rather than overwriting the task's `data`.
      for (const link of results) {
        const newUrl = shouldCrawl(scope, seenList, link.url, exclude);

        if (newUrl) {
          seenList.add(newUrl);
          // Stop queuing once the page limit is reached (0 = unlimited).
          if (numLinks++ >= limit && limit > 0) {
            break;
          }
          cluster.queue({url: newUrl});
        }
      }
    } catch (e) {
      console.log("Queuing Error: " + e);
    }
  });

  // The seed page counts towards the limit.
  numLinks++;
  cluster.queue({url});

  await cluster.idle();
  await cluster.close();

  // extra wait for all resources to land into WARCs
  console.log("Waiting 30s to ensure WARCs are finished");
  await sleep(30000);
}
/**
 * Decide whether a discovered link should be queued for crawling.
 *
 * @param {RegExp[]} scope - The URL must match at least one of these.
 * @param {Set<string>} seenList - Normalised URLs that were already accepted.
 *   Only read here; the caller records accepted URLs.
 * @param {string} url - Candidate link.
 * @param {RegExp[]} exclude - The URL must match none of these.
 * @returns {string|false} The normalised URL (fragment removed) when it
 *   should be crawled, otherwise `false`.
 */
function shouldCrawl(scope, seenList, url, exclude) {
  let parsed;

  try {
    parsed = new URL(url);
  } catch(e) {
    // not a parseable absolute URL
    return false;
  }

  // only queue http/https URLs
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return false;
  }

  // remove hashtag
  parsed.hash = "";
  const href = parsed.href;

  // skip already crawled
  if (seenList.has(href)) {
    return false;
  }

  // String#search always starts at index 0 and leaves the regex's lastIndex
  // untouched, so regexes carrying the g/y flag give stable results
  // (RegExp#exec would alternate between match and no match).
  const matches = (rx) => href.search(rx) >= 0;

  // check scopes
  if (!scope.some(matches)) {
    //console.log(`Not in scope ${href} ${scope}`);
    return false;
  }

  // check exclusions
  if (exclude.some(matches)) {
    //console.log(`Skipping ${href}, excluded`);
    return false;
  }

  return href;
}
/**
 * Issue a HEAD request to decide whether a URL should be loaded in the browser.
 *
 * Non-HTML resources are fetched through the capture endpoint instead
 * (see directCapture()).
 *
 * @param {string} url - URL about to be crawled.
 * @param {string} capturePrefix - Unused; kept so existing callers keep working.
 * @returns {Promise<boolean>} `true` when the page should be loaded in the
 *   browser (HTML, no Content-Type, or the check itself failed); `false`
 *   when the response status is >= 400 or the resource was captured directly.
 */
async function htmlCheck(url, capturePrefix) {
  try {
    const agent = url.startsWith("https:") ? HTTPS_AGENT : null;

    const resp = await fetch(url, {method: "HEAD", headers, agent});

    if (resp.status >= 400) {
      console.log(`Skipping ${url}, invalid status ${resp.status}`);
      return false;
    }

    const contentType = resp.headers.get("Content-Type");

    // just load if no content-type
    if (!contentType) {
      return true;
    }

    // Media types are case-insensitive and may carry whitespace before the
    // parameters ("Text/HTML ; charset=utf-8"), so normalise before comparing.
    const mime = contentType.split(";")[0].trim().toLowerCase();

    if (HTML_TYPES.includes(mime)) {
      return true;
    }

    // capture directly
    await directCapture(url);

    return false;
  } catch(e) {
    console.log("HTML Check error", e);
    // can't confirm not html, so try in browser
    return true;
  }
}
/**
 * Request a URL through the capture endpoint (`capturePrefix`) without a browser.
 *
 * The request is aborted as soon as fetch() resolves, i.e. once the response
 * headers have arrived; the response body is not read here.
 *
 * @param {string} url - Absolute URL of the resource to capture.
 * @returns {Promise<void>} Rejects if the fetch itself fails.
 */
async function directCapture(url) {
  console.log(`Direct capture: ${capturePrefix}${url}`);
  const abort = new AbortController();
  const signal = abort.signal;
  // The response object is not needed, only the request being made.
  await fetch(capturePrefix + url, {signal, headers});
  abort.abort();
}
/**
 * Scroll the current page down in 250px steps, pausing 500ms after each
 * step, until the bottom of the viewport reaches the document height.
 *
 * Kept fully self-contained (no references to outer names other than
 * browser globals) because the crawler hands it to page.evaluate().
 */
async function autoScroll() {
  const step = { top: 250, left: 0, behavior: 'auto' };

  // Largest of the height measurements exposed by <body> and <html>.
  function documentHeight() {
    const body = self.document.body;
    const root = self.document.documentElement;
    return Math.max(
      body.scrollHeight,
      body.offsetHeight,
      root.clientHeight,
      root.scrollHeight,
      root.offsetHeight
    );
  }

  const pause = (ms) => new Promise((done) => setTimeout(done, ms));

  while (self.scrollY + self.innerHeight < documentHeight()) {
    self.scrollBy(step);
    await pause(500);
  }
}
/**
 * Promise-based delay.
 *
 * @param {number} time - Delay in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) after the delay.
 */
function sleep(time) {
  return new Promise((wake) => {
    setTimeout(wake, time);
  });
}
/**
 * Command-line entry point: parse and validate options with yargs, run the
 * crawl, and exit with status 0 on success or 1 on failure.
 *
 * The yargs check callback normalises options in place: `url` becomes the
 * parsed href, `timeout` is converted from seconds to milliseconds, and
 * `scope` / `exclude` become arrays of RegExp.
 */
async function main() {
  const params = require('yargs')
    .usage("browsertrix-mini [options]")
    .options({
      "url": {
        alias: "u",
        describe: "The URL to start crawling from",
        demandOption: true,
        type: "string",
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        demandOption: false,
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture, can be a new: page, session or browser.",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing",
        default: "load",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scope": {
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

      "exclude": {
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

      "scroll": {
        describe: "If set, will autoscroll to bottom of the page",
        type: "boolean",
        default: false,
      }}).check((argv) => {
      // new URL() throws on an unparseable URL, which fails the check.
      const url = new URL(argv.url);

      if (url.protocol !== "http:" && url.protocol !== "https:") {
        throw new Error("URL must start with http:// or https://");
      }

      // ensure valid url is used (adds trailing slash if missing)
      argv.url = url.href;

      // seconds -> milliseconds
      argv.timeout *= 1000;

      // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
      // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
      if (!WAIT_UNTIL_OPTS.includes(argv.waitUntil)) {
        throw new Error("Invalid waitUntil, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }

      if (!NEW_CONTEXT_OPTS.includes(argv.newContext)) {
        throw new Error("Invalid newContext, must be one of: " + NEW_CONTEXT_OPTS.join(","));
      }

      // Support one or multiple exclude
      argv.exclude = toRegExpList(argv.exclude);

      // Support one or multiple scopes; the default scope is everything
      // under the immediate directory of the start URL.
      const defaultScope = "^" + rxEscape(url.href.slice(0, url.href.lastIndexOf("/") + 1));
      argv.scope = toRegExpList(argv.scope || defaultScope);

      return true;
    })
    .argv;

  console.log("Exclusions Regexes: ", params.exclude);
  console.log("Scope Regexes: ", params.scope);

  try {
    await run(params);
    process.exit(0);
  } catch(e) {
    console.error("Crawl failed");
    console.error(e);
    process.exit(1);
  }
}

// Turn an option given zero, one (string) or several (array) times into a list of RegExp.
function toRegExpList(value) {
  if (!value) {
    return [];
  }
  const patterns = typeof value === "string" ? [value] : value;
  return patterns.map((pattern) => new RegExp(pattern));
}
/**
 * Escape a string so it can be embedded literally in a regular expression.
 *
 * @param {string} string - Text to escape.
 * @returns {string} The text with every regex metacharacter (and `-`, `/`)
 *   prefixed by a backslash.
 */
function rxEscape(string) {
  const metaChars = /[-\/\\^$*+?.()|[\]{}]/g;
  return string.replace(metaChars, (ch) => `\\${ch}`);
}
main();

View File

@ -1,15 +0,0 @@
{
"name": "zimit-crawler",
"version": "1.0.0",
"main": "zimit-crawler",
"repository": "https://github.com/openzim/zimit",
"author": "Ilya Kreymer <ikreymer@gmail.com>",
"license": "MIT",
"dependencies": {
"abort-controller": "^3.0.0",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",
"yargs": "^16.0.3"
}
}

View File

@ -1,27 +0,0 @@
[uwsgi]
if-not-env = PORT
http-socket = :8080
socket = :8081
endif =
master = true
buffer-size = 65536
die-on-term = true
if-env = VIRTUAL_ENV
venv = $(VIRTUAL_ENV)
endif =
gevent = 200
#Not available until uwsgi 2.1
#monkey-patching manually in pywb.apps.wayback
#gevent-early-monkey-patch =
# for uwsgi<2.1, set env when using gevent
env = GEVENT_MONKEY_PATCH=1
processes = 8
# specify config file here
env = PYWB_CONFIG_FILE=/app/config.yaml
wsgi = pywb.apps.wayback

498
yarn.lock
View File

@ -1,498 +0,0 @@
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
# yarn lockfile v1
"@types/color-name@^1.1.1":
version "1.1.1"
resolved "https://registry.yarnpkg.com/@types/color-name/-/color-name-1.1.1.tgz#1c1261bbeaa10a8055bbc5d8ab84b7b2afc846a0"
integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==
"@types/node@*":
version "14.14.0"
resolved "https://registry.yarnpkg.com/@types/node/-/node-14.14.0.tgz#f1091b6ad5de18e8e91bdbd43ec63f13de372538"
integrity sha512-BfbIHP9IapdupGhq/hc+jT5dyiBVZ2DdeC5WwJWQWDb0GijQlzUFAeIQn/2GtvZcd2HVUU7An8felIICFTC2qg==
"@types/yauzl@^2.9.1":
version "2.9.1"
resolved "https://registry.yarnpkg.com/@types/yauzl/-/yauzl-2.9.1.tgz#d10f69f9f522eef3cf98e30afb684a1e1ec923af"
integrity sha512-A1b8SU4D10uoPjwb0lnHmmu8wZhR9d+9o2PKBQT2jU5YPTKsxac6M2qGAdY7VcL+dHHhARVUDmeg0rOrcd9EjA==
dependencies:
"@types/node" "*"
abort-controller@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392"
integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==
dependencies:
event-target-shim "^5.0.0"
agent-base@5:
version "5.1.1"
resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c"
integrity sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g==
ansi-regex@^5.0.0:
version "5.0.0"
resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.0.tgz#388539f55179bf39339c81af30a654d69f87cb75"
integrity sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==
ansi-styles@^4.0.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-4.2.1.tgz#90ae75c424d008d2624c5bf29ead3177ebfcf359"
integrity sha512-9VGjrMsG1vePxcSweQsN20KY/c4zN0h9fLjqAbwbPfahM3t+NL+M9HC8xeXG2I8pX5NoamTGNuomEUFI7fcUjA==
dependencies:
"@types/color-name" "^1.1.1"
color-convert "^2.0.1"
balanced-match@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.0.tgz#89b4d199ab2bee49de164ea02b89ce462d71b767"
integrity sha1-ibTRmasr7kneFk6gK4nORi1xt2c=
base64-js@^1.0.2:
version "1.3.1"
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.3.1.tgz#58ece8cb75dd07e71ed08c736abc5fac4dbf8df1"
integrity sha512-mLQ4i2QO1ytvGWFWmcngKO//JXAQueZvwEKtjgQFM4jIK0kU+ytMfplL8j+n5mspOfjHwoAg+9yhb7BwAHm36g==
bl@^4.0.3:
version "4.0.3"
resolved "https://registry.yarnpkg.com/bl/-/bl-4.0.3.tgz#12d6287adc29080e22a705e5764b2a9522cdc489"
integrity sha512-fs4G6/Hu4/EE+F75J8DuN/0IpQqNjAdC7aEQv7Qt8MHGUH7Ckv2MwTEEeN9QehD0pfIDkMI1bkHYkKy7xHyKIg==
dependencies:
buffer "^5.5.0"
inherits "^2.0.4"
readable-stream "^3.4.0"
brace-expansion@^1.1.7:
version "1.1.11"
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==
dependencies:
balanced-match "^1.0.0"
concat-map "0.0.1"
buffer-crc32@~0.2.3:
version "0.2.13"
resolved "https://registry.yarnpkg.com/buffer-crc32/-/buffer-crc32-0.2.13.tgz#0d333e3f00eac50aa1454abd30ef8c2a5d9a7242"
integrity sha1-DTM+PwDqxQqhRUq9MO+MKl2ackI=
buffer@^5.2.1, buffer@^5.5.0:
version "5.6.0"
resolved "https://registry.yarnpkg.com/buffer/-/buffer-5.6.0.tgz#a31749dc7d81d84db08abf937b6b8c4033f62786"
integrity sha512-/gDYp/UtU0eA1ys8bOs9J6a+E/KWIY+DZ+Q2WESNUA0jFRsJOc0SNUO6xJ5SGA1xueg3NL65W6s+NY5l9cunuw==
dependencies:
base64-js "^1.0.2"
ieee754 "^1.1.4"
chownr@^1.1.1:
version "1.1.4"
resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b"
integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
cliui@^7.0.0:
version "7.0.1"
resolved "https://registry.yarnpkg.com/cliui/-/cliui-7.0.1.tgz#a4cb67aad45cd83d8d05128fc9f4d8fbb887e6b3"
integrity sha512-rcvHOWyGyid6I1WjT/3NatKj2kDt9OdSHSXpyLXaMWFbKpGACNW8pRhhdPUq9MWUOdwn8Rz9AVETjF4105rZZQ==
dependencies:
string-width "^4.2.0"
strip-ansi "^6.0.0"
wrap-ansi "^7.0.0"
color-convert@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-2.0.1.tgz#72d3a68d598c9bdb3af2ad1e84f21d896abd4de3"
integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==
dependencies:
color-name "~1.1.4"
color-name@~1.1.4:
version "1.1.4"
resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2"
integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
concat-map@0.0.1:
version "0.0.1"
resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b"
integrity sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=
debug@4, debug@^4.1.0, debug@^4.1.1:
version "4.2.0"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.2.0.tgz#7f150f93920e94c58f5574c2fd01a3110effe7f1"
integrity sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg==
dependencies:
ms "2.1.2"
devtools-protocol@0.0.799653:
version "0.0.799653"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.799653.tgz#86fc95ce5bf4fdf4b77a58047ba9d2301078f119"
integrity sha512-t1CcaZbvm8pOlikqrsIM9GOa7Ipp07+4h/q9u0JXBWjPCjHdBl9KkddX87Vv9vBHoBGtwV79sYQNGnQM6iS5gg==
emoji-regex@^8.0.0:
version "8.0.0"
resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-8.0.0.tgz#e818fd69ce5ccfcb404594f842963bf53164cc37"
integrity sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==
end-of-stream@^1.1.0, end-of-stream@^1.4.1:
version "1.4.4"
resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0"
integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==
dependencies:
once "^1.4.0"
escalade@^3.0.2:
version "3.1.0"
resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e"
integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig==
event-target-shim@^5.0.0:
version "5.0.1"
resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
extract-zip@^2.0.0:
version "2.0.1"
resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a"
integrity sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==
dependencies:
debug "^4.1.1"
get-stream "^5.1.0"
yauzl "^2.10.0"
optionalDependencies:
"@types/yauzl" "^2.9.1"
fd-slicer@~1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/fd-slicer/-/fd-slicer-1.1.0.tgz#25c7c89cb1f9077f8891bbe61d8f390eae256f1e"
integrity sha1-JcfInLH5B3+IkbvmHY85Dq4lbx4=
dependencies:
pend "~1.2.0"
find-up@^4.0.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/find-up/-/find-up-4.1.0.tgz#97afe7d6cdc0bc5928584b7c8d7b16e8a9aa5d19"
integrity sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==
dependencies:
locate-path "^5.0.0"
path-exists "^4.0.0"
fs-constants@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad"
integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==
fs.realpath@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f"
integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8=
get-caller-file@^2.0.5:
version "2.0.5"
resolved "https://registry.yarnpkg.com/get-caller-file/-/get-caller-file-2.0.5.tgz#4f94412a82db32f36e3b0b9741f8a97feb031f7e"
integrity sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==
get-stream@^5.1.0:
version "5.2.0"
resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-5.2.0.tgz#4966a1795ee5ace65e706c4b7beb71257d6e22d3"
integrity sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==
dependencies:
pump "^3.0.0"
glob@^7.1.3:
version "7.1.6"
resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.6.tgz#141f33b81a7c2492e125594307480c46679278a6"
integrity sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==
dependencies:
fs.realpath "^1.0.0"
inflight "^1.0.4"
inherits "2"
minimatch "^3.0.4"
once "^1.3.0"
path-is-absolute "^1.0.0"
https-proxy-agent@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz#702b71fb5520a132a66de1f67541d9e62154d82b"
integrity sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg==
dependencies:
agent-base "5"
debug "4"
ieee754@^1.1.4:
version "1.1.13"
resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.1.13.tgz#ec168558e95aa181fd87d37f55c32bbcb6708b84"
integrity sha512-4vf7I2LYV/HaWerSo3XmlMkp5eZ83i+/CDluXi/IGTs/O1sejBNhTtnxzmRZfvOUqj7lZjqHkeTvpgSFDlWZTg==
inflight@^1.0.4:
version "1.0.6"
resolved "https://registry.yarnpkg.com/inflight/-/inflight-1.0.6.tgz#49bd6331d7d02d0c09bc910a1075ba8165b56df9"
integrity sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=
dependencies:
once "^1.3.0"
wrappy "1"
inherits@2, inherits@^2.0.3, inherits@^2.0.4:
version "2.0.4"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
is-fullwidth-code-point@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz#f116f8064fe90b3f7844a38997c0b75051269f1d"
integrity sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==
locate-path@^5.0.0:
version "5.0.0"
resolved "https://registry.yarnpkg.com/locate-path/-/locate-path-5.0.0.tgz#1afba396afd676a6d42504d0a67a3a7eb9f62aa0"
integrity sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==
dependencies:
p-locate "^4.1.0"
minimatch@^3.0.4:
version "3.0.4"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
integrity sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==
dependencies:
brace-expansion "^1.1.7"
mkdirp-classic@^0.5.2:
version "0.5.3"
resolved "https://registry.yarnpkg.com/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz#fa10c9115cc6d8865be221ba47ee9bed78601113"
integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==
ms@2.1.2:
version "2.1.2"
resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
node-fetch@^2.6.1:
version "2.6.1"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
once@^1.3.0, once@^1.3.1, once@^1.4.0:
version "1.4.0"
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
integrity sha1-WDsap3WWHUsROsF9nFC6753Xa9E=
dependencies:
wrappy "1"
p-limit@^2.2.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.3.0.tgz#3dd33c647a214fdfffd835933eb086da0dc21db1"
integrity sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==
dependencies:
p-try "^2.0.0"
p-locate@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/p-locate/-/p-locate-4.1.0.tgz#a3428bb7088b3a60292f66919278b7c297ad4f07"
integrity sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==
dependencies:
p-limit "^2.2.0"
p-try@^2.0.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6"
integrity sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==
path-exists@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/path-exists/-/path-exists-4.0.0.tgz#513bdbe2d3b95d7762e8c1137efa195c6c61b5b3"
integrity sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==
path-is-absolute@^1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f"
integrity sha1-F0uSaHNVNP+8es5r9TpanhtcX18=
pend@~1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
integrity sha1-elfrVQpng/kRUzH89GY9XI4AelA=
pkg-dir@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3"
integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==
dependencies:
find-up "^4.0.0"
progress@^2.0.1:
version "2.0.3"
resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8"
integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==
proxy-from-env@^1.0.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2"
integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==
pump@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/pump/-/pump-3.0.0.tgz#b4a2116815bde2f4e1ea602354e8c75565107a64"
integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==
dependencies:
end-of-stream "^1.1.0"
once "^1.3.1"
puppeteer-cluster@^0.22.0:
version "0.22.0"
resolved "https://registry.yarnpkg.com/puppeteer-cluster/-/puppeteer-cluster-0.22.0.tgz#4ab214671f414f15ad6a94a4b61ed0b4172e86e6"
integrity sha512-hmydtMwfVM+idFIDzS8OXetnujHGre7RY3BGL+3njy9+r8Dcu3VALkZHfuBEPf6byKssTCgzxU1BvLczifXd5w==
dependencies:
debug "^4.1.1"
puppeteer-core@^5.3.1:
version "5.3.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.1.tgz#1affb1738afac499416a7fd4ed2ed0c18577e88f"
integrity sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==
dependencies:
debug "^4.1.0"
devtools-protocol "0.0.799653"
extract-zip "^2.0.0"
https-proxy-agent "^4.0.0"
pkg-dir "^4.2.0"
progress "^2.0.1"
proxy-from-env "^1.0.0"
rimraf "^3.0.2"
tar-fs "^2.0.0"
unbzip2-stream "^1.3.3"
ws "^7.2.3"
readable-stream@^3.1.1, readable-stream@^3.4.0:
version "3.6.0"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198"
integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==
dependencies:
inherits "^2.0.3"
string_decoder "^1.1.1"
util-deprecate "^1.0.1"
require-directory@^2.1.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42"
integrity sha1-jGStX9MNqxyXbiNE/+f3kqam30I=
rimraf@^3.0.2:
version "3.0.2"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
dependencies:
glob "^7.1.3"
safe-buffer@~5.2.0:
version "5.2.1"
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6"
integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
string-width@^4.1.0, string-width@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.0.tgz#952182c46cc7b2c313d1596e623992bd163b72b5"
integrity sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg==
dependencies:
emoji-regex "^8.0.0"
is-fullwidth-code-point "^3.0.0"
strip-ansi "^6.0.0"
string_decoder@^1.1.1:
version "1.3.0"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
dependencies:
safe-buffer "~5.2.0"
strip-ansi@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.0.tgz#0b1571dd7669ccd4f3e06e14ef1eed26225ae532"
integrity sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==
dependencies:
ansi-regex "^5.0.0"
tar-fs@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.0.tgz#d1cdd121ab465ee0eb9ccde2d35049d3f3daf0d5"
integrity sha512-9uW5iDvrIMCVpvasdFHW0wJPez0K4JnMZtsuIeDI7HyMGJNxmDZDOCQROr7lXyS+iL/QMpj07qcjGYTSdRFXUg==
dependencies:
chownr "^1.1.1"
mkdirp-classic "^0.5.2"
pump "^3.0.0"
tar-stream "^2.0.0"
tar-stream@^2.0.0:
version "2.1.4"
resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.1.4.tgz#c4fb1a11eb0da29b893a5b25476397ba2d053bfa"
integrity sha512-o3pS2zlG4gxr67GmFYBLlq+dM8gyRGUOvsrHclSkvtVtQbjV0s/+ZE8OpICbaj8clrX3tjeHngYGP7rweaBnuw==
dependencies:
bl "^4.0.3"
end-of-stream "^1.4.1"
fs-constants "^1.0.0"
inherits "^2.0.3"
readable-stream "^3.1.1"
through@^2.3.8:
version "2.3.8"
resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=
unbzip2-stream@^1.3.3:
version "1.4.3"
resolved "https://registry.yarnpkg.com/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz#b0da04c4371311df771cdc215e87f2130991ace7"
integrity sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==
dependencies:
buffer "^5.2.1"
through "^2.3.8"
util-deprecate@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=
wrap-ansi@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
dependencies:
ansi-styles "^4.0.0"
string-width "^4.1.0"
strip-ansi "^6.0.0"
wrappy@1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f"
integrity sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=
ws@^7.2.3:
version "7.3.1"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.3.1.tgz#d0547bf67f7ce4f12a72dfe31262c68d7dc551c8"
integrity sha512-D3RuNkynyHmEJIpD2qrgVkc9DQ23OrN/moAwZX4L8DfvszsJxpjQuUq3LMx6HoYji9fbIOBY18XWBsAux1ZZUA==
y18n@^5.0.1:
version "5.0.1"
resolved "https://registry.yarnpkg.com/y18n/-/y18n-5.0.1.tgz#1ad2a7eddfa8bce7caa2e1f6b5da96c39d99d571"
integrity sha512-/jJ831jEs4vGDbYPQp4yGKDYPSCCEQ45uZWJHE1AoYBzqdZi8+LDWas0z4HrmJXmKdpFsTiowSHXdxyFhpmdMg==
yargs-parser@^20.0.0:
version "20.0.0"
resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-20.0.0.tgz#c65a1daaa977ad63cebdd52159147b789a4e19a9"
integrity sha512-8eblPHTL7ZWRkyjIZJjnGf+TijiKJSwA24svzLRVvtgoi/RZiKa9fFQTrlx0OKLnyHSdt/enrdadji6WFfESVA==
yargs@^16.0.3:
version "16.0.3"
resolved "https://registry.yarnpkg.com/yargs/-/yargs-16.0.3.tgz#7a919b9e43c90f80d4a142a89795e85399a7e54c"
integrity sha512-6+nLw8xa9uK1BOEOykaiYAJVh6/CjxWXK/q9b5FpRgNslt8s22F2xMBqVIKgCRjNgGvGPBy8Vog7WN7yh4amtA==
dependencies:
cliui "^7.0.0"
escalade "^3.0.2"
get-caller-file "^2.0.5"
require-directory "^2.1.1"
string-width "^4.2.0"
y18n "^5.0.1"
yargs-parser "^20.0.0"
yauzl@^2.10.0:
version "2.10.0"
resolved "https://registry.yarnpkg.com/yauzl/-/yauzl-2.10.0.tgz#c7eb17c93e112cb1086fa6d8e51fb0667b79a5f9"
integrity sha1-x+sXyT4RLLEIb6bY5R+wZnt5pfk=
dependencies:
buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0"

View File

@ -9,17 +9,17 @@ and then calls the Node based driver
"""
from argparse import ArgumentParser
import os
import tempfile
import subprocess
import atexit
import shutil
import glob
import signal
import sys
from pathlib import Path
from urllib.parse import urlsplit
from warc2zim.main import warc2zim
import requests
def zimit(args=None):
@ -33,7 +33,7 @@ def zimit(args=None):
parser.add_argument(
"--newContext",
help="The context for each new capture, can be a new: page, session or browser.",
help="The context for each new capture (page, session or browser).",
choices=["page", "session", "browser"],
default="page",
)
@ -58,7 +58,8 @@ def zimit(args=None):
parser.add_argument(
"--scope",
help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
help="Regex of page URLs that should be included in the crawl "
"(defaults to the immediate directory of the URL)",
)
parser.add_argument(
@ -89,9 +90,12 @@ def zimit(args=None):
warc2zim_args.append("--output")
warc2zim_args.append(zimit_args.output)
if zimit_args.url:
url = zimit_args.url
if url:
url = check_url(url)
warc2zim_args.append("--url")
warc2zim_args.append(zimit_args.url)
warc2zim_args.append(url)
print("----------")
print("Testing warc2zim args")
@ -109,36 +113,24 @@ def zimit(args=None):
def cleanup():
print("")
print("----------")
print("Cleanup, removing temp dir: " + str(temp_root_dir))
print(f"Cleanup, removing temp dir: {temp_root_dir}")
shutil.rmtree(temp_root_dir)
atexit.register(cleanup)
# create pywb collection
print("")
print("----------")
print("pywb init")
subprocess.run(
["/usr/bin/env", "wb-manager", "init", "capture"], check=True, cwd=temp_root_dir
) # nosec
subprocess.Popen(
["/usr/bin/env", "redis-server"], cwd=temp_root_dir, stdout=subprocess.DEVNULL
) # nosec
subprocess.Popen(
["/usr/bin/env", "uwsgi", os.getcwd() + "/uwsgi.ini"],
cwd=temp_root_dir,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
) # nosec
cmd_args = get_node_cmd_line(zimit_args)
if url:
cmd_args.append("--url")
cmd_args.append(url)
cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
cmd_line = " ".join(cmd_args)
print("")
print("----------")
print("running zimit driver: " + cmd_line)
print(f"running browsertrix-crawler crawl: {cmd_line}")
subprocess.run(cmd_args, check=True)
warc_files = temp_root_dir / "collections" / "capture" / "archive"
@ -148,14 +140,34 @@ def zimit(args=None):
print("")
print("----------")
print("Processing {0} WARC files to ZIM".format(num_files))
print(f"Processing {num_files} WARC files to ZIM")
return warc2zim(warc2zim_args)
def check_url(url):
    """Resolve the URL the crawl should actually start from.

    Sends a HEAD request to ``url`` (following redirects, 10 second timeout)
    and inspects the final URL reported by the response.

    :param url: main page URL to probe
    :returns: ``url`` unchanged when the response URL equals it, otherwise
        the response URL (only when its network location equals that of
        ``url``)
    :raises SystemExit: with status 1, after printing a message, when the
        request raises a ``requests.exceptions.RequestException``
    :raises ValueError: when the response URL's network location differs
        from that of ``url``
    """
    try:
        # With stream=True the response keeps its connection until it is
        # closed. Only the final URL is needed, so release it right away
        # instead of leaving that to garbage collection.
        with requests.head(
            url, stream=True, allow_redirects=True, timeout=10
        ) as resp:
            actual_url = resp.url
    except requests.exceptions.RequestException as exc:
        print(f"failed to connect to {url}: {exc}")
        raise SystemExit(1)

    if actual_url == url:
        return url

    # A redirect happened: accept it only if it stays on the same netloc.
    if urlsplit(url).netloc != urlsplit(actual_url).netloc:
        raise ValueError(
            f"Main page URL ({url}) redirects to out-of-scope domain "
            f"({actual_url}), cancelling crawl"
        )
    return actual_url
def get_node_cmd_line(args):
node_cmd = ["/usr/bin/env", "node", "crawler.js"]
node_cmd = ["crawl"]
for arg in [
"url",
"workers",
"newContext",
"waitUntil",