From 7346527a8108a0a20b9c64e3b4c97cfb36cd7915 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Sep 2020 17:38:52 +0000 Subject: [PATCH 01/10] initial setup - single url capture with existing browser image, pywb, puppeteer and warc2zim --- .gitignore | 2 + Dockerfile | 46 ++++++- config.yaml | 17 +++ index.js | 52 ++++++++ package.json | 11 ++ run.sh | 17 +++ uwsgi.ini | 26 ++++ yarn.lock | 368 +++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 537 insertions(+), 2 deletions(-) create mode 100644 config.yaml create mode 100644 index.js create mode 100644 package.json create mode 100755 run.sh create mode 100644 uwsgi.ini create mode 100644 yarn.lock diff --git a/.gitignore b/.gitignore index 22e9769..cbbc2e5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ __pycache__ *.zim *.egg-info/ +collections/ +node_modules/ diff --git a/Dockerfile b/Dockerfile index 5ff8c33..2a9360d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,45 @@ -FROM debian:buster-slim +FROM oldwebtoday/chrome:84 as chrome + +FROM nikolaik/python-nodejs + +ENV PROXY_HOST=localhost \ + PROXY_PORT=8080 \ + PROXY_CA_URL=http://wsgiprox/download/pem \ + PROXY_CA_FILE=/tmp/proxy-ca.pem \ + NO_SOCAT=1 + +RUN pip install pywb uwsgi warc2zim + +COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/ + +COPY --from=chrome /app/ /browser/ +COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/ +COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/ +COPY --from=chrome /usr/bin/run_forever /usr/bin/ +COPY --from=chrome /usr/bin/wget /usr/bin/ +COPY --from=chrome /usr/bin/certutil /usr/bin/ + +RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome + +RUN useradd zimit --shell /bin/bash --create-home \ + && usermod -a -G sudo zimit \ + && echo 'ALL ALL = (ALL) NOPASSWD: ALL' >> /etc/sudoers \ + && echo 'zimit:secret' | chpasswd + +WORKDIR /app + +ADD index.js /app/ +ADD package.json /app/ + +RUN chown -R zimit /app + +USER 
zimit + +RUN yarn install + +ADD config.yaml /app/ +ADD uwsgi.ini /app/ +ADD run.sh /app/ + +ENTRYPOINT ["/app/run.sh"] -CMD ["/bin/bash"] diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..2250640 --- /dev/null +++ b/config.yaml @@ -0,0 +1,17 @@ +debug: true + +proxy: + coll: capture + recording: true + + enable_banner: false + enable_wombat: true + +recorder: live + +autoindex: 10 + +enable_auto_fetch: true + +collections: + live: $live diff --git a/index.js b/index.js new file mode 100644 index 0000000..e68ffdb --- /dev/null +++ b/index.js @@ -0,0 +1,52 @@ +const puppeteer = require("puppeteer-core"); + +const PAGE_TIMEOUT = 60000; + +async function run() { + const defaultViewport = null; + const browserURL = `http://localhost:9222`; + let browser = null; + + console.log("waiting for browser..."); + + while (!browser) { + try { + browser = await puppeteer.connect({browserURL, defaultViewport}); + } catch (e) { + //console.log(e); + await new Promise((resolve) => setTimeout(resolve, 500)); + } + } + + console.log("connected!"); + + const pages = await browser.pages(); + + const page = pages.length ? pages[0] : await browser.newPage(); + + console.log(process.argv); + const url = process.argv.length > 2 ? 
process.argv[2] : ""; + + if (!url) { + throw "No URL specified, exiting"; + } + + await page.goto(url, {"waitUntil": "networkidle0", "timeout": PAGE_TIMEOUT}); + + console.log("loaded!"); +} + + +async function main() { + try { + await run(); + process.exit(0); + } catch(e) { + console.log(e); + process.exit(1); + } +} + +main(); + + diff --git a/package.json b/package.json new file mode 100644 index 0000000..1f4226d --- /dev/null +++ b/package.json @@ -0,0 +1,11 @@ +{ + "name": "zimit", + "version": "1.0.0", + "main": "index.js", + "repository": "https://github.com/openzim/zimit", + "author": "Ilya Kreymer ", + "license": "MIT", + "dependencies": { + "puppeteer-core": "^5.3.0" + } +} diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..f15e6d6 --- /dev/null +++ b/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash +URL="$1" + +wb-manager init capture +uwsgi uwsgi.ini & + +/browser/browser_entrypoint.sh /browser/run.sh & + +node index.js "$URL" + +NAME=${NAME:=zimfile} + +stat /output + +warc2zim --url $URL --name $NAME --output=/output ./collections/capture/archive/*.warc.gz + + diff --git a/uwsgi.ini b/uwsgi.ini new file mode 100644 index 0000000..f415aa3 --- /dev/null +++ b/uwsgi.ini @@ -0,0 +1,26 @@ +[uwsgi] +if-not-env = PORT +http-socket = :8080 +socket = :8081 +endif = + +master = true +buffer-size = 65536 +die-on-term = true + +if-env = VIRTUAL_ENV +venv = $(VIRTUAL_ENV) +endif = + +gevent = 100 + +#Not available until uwsgi 2.1 +#monkey-patching manually in pywb.apps.wayback +#gevent-early-monkey-patch = +# for uwsgi<2.1, set env when using gevent +env = GEVENT_MONKEY_PATCH=1 + +# specify config file here +env = PYWB_CONFIG_FILE=config.yaml +wsgi = pywb.apps.wayback + diff --git a/yarn.lock b/yarn.lock new file mode 100644 index 0000000..f0f888d --- /dev/null +++ b/yarn.lock @@ -0,0 +1,368 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +"@types/node@*": + version "14.11.1" + resolved "https://registry.yarnpkg.com/@types/node/-/node-14.11.1.tgz#56af902ad157e763f9ba63d671c39cda3193c835" + integrity sha512-oTQgnd0hblfLsJ6BvJzzSL+Inogp3lq9fGgqRkMB/ziKMgEUaFl801OncOzUmalfzt14N0oPHMK47ipl+wbTIw== + +"@types/yauzl@^2.9.1": + version "2.9.1" + resolved "https://registry.yarnpkg.com/@types/yauzl/-/yauzl-2.9.1.tgz#d10f69f9f522eef3cf98e30afb684a1e1ec923af" + integrity sha512-A1b8SU4D10uoPjwb0lnHmmu8wZhR9d+9o2PKBQT2jU5YPTKsxac6M2qGAdY7VcL+dHHhARVUDmeg0rOrcd9EjA== + dependencies: + "@types/node" "*" + +agent-base@5: + version "5.1.1" + resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c" + integrity sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g== + +balanced-match@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.0.tgz#89b4d199ab2bee49de164ea02b89ce462d71b767" + integrity sha1-ibTRmasr7kneFk6gK4nORi1xt2c= + +base64-js@^1.0.2: + version "1.3.1" + resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.3.1.tgz#58ece8cb75dd07e71ed08c736abc5fac4dbf8df1" + integrity sha512-mLQ4i2QO1ytvGWFWmcngKO//JXAQueZvwEKtjgQFM4jIK0kU+ytMfplL8j+n5mspOfjHwoAg+9yhb7BwAHm36g== + +bl@^4.0.3: + version "4.0.3" + resolved "https://registry.yarnpkg.com/bl/-/bl-4.0.3.tgz#12d6287adc29080e22a705e5764b2a9522cdc489" + integrity sha512-fs4G6/Hu4/EE+F75J8DuN/0IpQqNjAdC7aEQv7Qt8MHGUH7Ckv2MwTEEeN9QehD0pfIDkMI1bkHYkKy7xHyKIg== + dependencies: + buffer "^5.5.0" + inherits "^2.0.4" + readable-stream "^3.4.0" + +brace-expansion@^1.1.7: + version "1.1.11" + resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" + integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA== + dependencies: + balanced-match "^1.0.0" + concat-map "0.0.1" + 
+buffer-crc32@~0.2.3: + version "0.2.13" + resolved "https://registry.yarnpkg.com/buffer-crc32/-/buffer-crc32-0.2.13.tgz#0d333e3f00eac50aa1454abd30ef8c2a5d9a7242" + integrity sha1-DTM+PwDqxQqhRUq9MO+MKl2ackI= + +buffer@^5.2.1, buffer@^5.5.0: + version "5.6.0" + resolved "https://registry.yarnpkg.com/buffer/-/buffer-5.6.0.tgz#a31749dc7d81d84db08abf937b6b8c4033f62786" + integrity sha512-/gDYp/UtU0eA1ys8bOs9J6a+E/KWIY+DZ+Q2WESNUA0jFRsJOc0SNUO6xJ5SGA1xueg3NL65W6s+NY5l9cunuw== + dependencies: + base64-js "^1.0.2" + ieee754 "^1.1.4" + +chownr@^1.1.1: + version "1.1.4" + resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b" + integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== + +concat-map@0.0.1: + version "0.0.1" + resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" + integrity sha1-2Klr13/Wjfd5OnMDajug1UBdR3s= + +debug@4, debug@^4.1.0, debug@^4.1.1: + version "4.2.0" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.2.0.tgz#7f150f93920e94c58f5574c2fd01a3110effe7f1" + integrity sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg== + dependencies: + ms "2.1.2" + +devtools-protocol@0.0.799653: + version "0.0.799653" + resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.799653.tgz#86fc95ce5bf4fdf4b77a58047ba9d2301078f119" + integrity sha512-t1CcaZbvm8pOlikqrsIM9GOa7Ipp07+4h/q9u0JXBWjPCjHdBl9KkddX87Vv9vBHoBGtwV79sYQNGnQM6iS5gg== + +end-of-stream@^1.1.0, end-of-stream@^1.4.1: + version "1.4.4" + resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0" + integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q== + dependencies: + once "^1.4.0" + +extract-zip@^2.0.0: + version "2.0.1" + resolved 
"https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a" + integrity sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg== + dependencies: + debug "^4.1.1" + get-stream "^5.1.0" + yauzl "^2.10.0" + optionalDependencies: + "@types/yauzl" "^2.9.1" + +fd-slicer@~1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/fd-slicer/-/fd-slicer-1.1.0.tgz#25c7c89cb1f9077f8891bbe61d8f390eae256f1e" + integrity sha1-JcfInLH5B3+IkbvmHY85Dq4lbx4= + dependencies: + pend "~1.2.0" + +find-up@^4.0.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/find-up/-/find-up-4.1.0.tgz#97afe7d6cdc0bc5928584b7c8d7b16e8a9aa5d19" + integrity sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw== + dependencies: + locate-path "^5.0.0" + path-exists "^4.0.0" + +fs-constants@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad" + integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow== + +fs.realpath@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" + integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8= + +get-stream@^5.1.0: + version "5.2.0" + resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-5.2.0.tgz#4966a1795ee5ace65e706c4b7beb71257d6e22d3" + integrity sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA== + dependencies: + pump "^3.0.0" + +glob@^7.1.3: + version "7.1.6" + resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.6.tgz#141f33b81a7c2492e125594307480c46679278a6" + integrity sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA== + dependencies: + fs.realpath "^1.0.0" + inflight "^1.0.4" + inherits "2" + minimatch 
"^3.0.4" + once "^1.3.0" + path-is-absolute "^1.0.0" + +https-proxy-agent@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz#702b71fb5520a132a66de1f67541d9e62154d82b" + integrity sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg== + dependencies: + agent-base "5" + debug "4" + +ieee754@^1.1.4: + version "1.1.13" + resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.1.13.tgz#ec168558e95aa181fd87d37f55c32bbcb6708b84" + integrity sha512-4vf7I2LYV/HaWerSo3XmlMkp5eZ83i+/CDluXi/IGTs/O1sejBNhTtnxzmRZfvOUqj7lZjqHkeTvpgSFDlWZTg== + +inflight@^1.0.4: + version "1.0.6" + resolved "https://registry.yarnpkg.com/inflight/-/inflight-1.0.6.tgz#49bd6331d7d02d0c09bc910a1075ba8165b56df9" + integrity sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk= + dependencies: + once "^1.3.0" + wrappy "1" + +inherits@2, inherits@^2.0.3, inherits@^2.0.4: + version "2.0.4" + resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" + integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== + +locate-path@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/locate-path/-/locate-path-5.0.0.tgz#1afba396afd676a6d42504d0a67a3a7eb9f62aa0" + integrity sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g== + dependencies: + p-locate "^4.1.0" + +mime@^2.0.3: + version "2.4.6" + resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.6.tgz#e5b407c90db442f2beb5b162373d07b69affa4d1" + integrity sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA== + +minimatch@^3.0.4: + version "3.0.4" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083" + integrity sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA== + 
dependencies: + brace-expansion "^1.1.7" + +mkdirp-classic@^0.5.2: + version "0.5.3" + resolved "https://registry.yarnpkg.com/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz#fa10c9115cc6d8865be221ba47ee9bed78601113" + integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A== + +ms@2.1.2: + version "2.1.2" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" + integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== + +once@^1.3.0, once@^1.3.1, once@^1.4.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1" + integrity sha1-WDsap3WWHUsROsF9nFC6753Xa9E= + dependencies: + wrappy "1" + +p-limit@^2.2.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.3.0.tgz#3dd33c647a214fdfffd835933eb086da0dc21db1" + integrity sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w== + dependencies: + p-try "^2.0.0" + +p-locate@^4.1.0: + version "4.1.0" + resolved "https://registry.yarnpkg.com/p-locate/-/p-locate-4.1.0.tgz#a3428bb7088b3a60292f66919278b7c297ad4f07" + integrity sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A== + dependencies: + p-limit "^2.2.0" + +p-try@^2.0.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6" + integrity sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ== + +path-exists@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/path-exists/-/path-exists-4.0.0.tgz#513bdbe2d3b95d7762e8c1137efa195c6c61b5b3" + integrity sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w== + +path-is-absolute@^1.0.0: + version "1.0.1" + resolved 
"https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f" + integrity sha1-F0uSaHNVNP+8es5r9TpanhtcX18= + +pend@~1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50" + integrity sha1-elfrVQpng/kRUzH89GY9XI4AelA= + +pkg-dir@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3" + integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ== + dependencies: + find-up "^4.0.0" + +progress@^2.0.1: + version "2.0.3" + resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8" + integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA== + +proxy-from-env@^1.0.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2" + integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg== + +pump@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/pump/-/pump-3.0.0.tgz#b4a2116815bde2f4e1ea602354e8c75565107a64" + integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww== + dependencies: + end-of-stream "^1.1.0" + once "^1.3.1" + +puppeteer-core@^5.3.0: + version "5.3.0" + resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.0.tgz#36d2e10132632c9cb73007f3f2880f4e6b655977" + integrity sha512-+4wk+0dcDNg7AQqN41Q9r41U6iltAtknuVBI0aj0O/Vp8/4orgbFV0wn55wV5xRae//CucLPUnaczxZx7dz0UA== + dependencies: + debug "^4.1.0" + devtools-protocol "0.0.799653" + extract-zip "^2.0.0" + https-proxy-agent "^4.0.0" + mime "^2.0.3" + pkg-dir "^4.2.0" + progress "^2.0.1" + proxy-from-env "^1.0.0" + rimraf "^3.0.2" + tar-fs "^2.0.0" + 
unbzip2-stream "^1.3.3" + ws "^7.2.3" + +readable-stream@^3.1.1, readable-stream@^3.4.0: + version "3.6.0" + resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198" + integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA== + dependencies: + inherits "^2.0.3" + string_decoder "^1.1.1" + util-deprecate "^1.0.1" + +rimraf@^3.0.2: + version "3.0.2" + resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a" + integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA== + dependencies: + glob "^7.1.3" + +safe-buffer@~5.2.0: + version "5.2.1" + resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" + integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== + +string_decoder@^1.1.1: + version "1.3.0" + resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e" + integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== + dependencies: + safe-buffer "~5.2.0" + +tar-fs@^2.0.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.0.tgz#d1cdd121ab465ee0eb9ccde2d35049d3f3daf0d5" + integrity sha512-9uW5iDvrIMCVpvasdFHW0wJPez0K4JnMZtsuIeDI7HyMGJNxmDZDOCQROr7lXyS+iL/QMpj07qcjGYTSdRFXUg== + dependencies: + chownr "^1.1.1" + mkdirp-classic "^0.5.2" + pump "^3.0.0" + tar-stream "^2.0.0" + +tar-stream@^2.0.0: + version "2.1.4" + resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.1.4.tgz#c4fb1a11eb0da29b893a5b25476397ba2d053bfa" + integrity sha512-o3pS2zlG4gxr67GmFYBLlq+dM8gyRGUOvsrHclSkvtVtQbjV0s/+ZE8OpICbaj8clrX3tjeHngYGP7rweaBnuw== + dependencies: + bl "^4.0.3" + end-of-stream "^1.4.1" + fs-constants "^1.0.0" + 
inherits "^2.0.3" + readable-stream "^3.1.1" + +through@^2.3.8: + version "2.3.8" + resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5" + integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU= + +unbzip2-stream@^1.3.3: + version "1.4.3" + resolved "https://registry.yarnpkg.com/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz#b0da04c4371311df771cdc215e87f2130991ace7" + integrity sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg== + dependencies: + buffer "^5.2.1" + through "^2.3.8" + +util-deprecate@^1.0.1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" + integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8= + +wrappy@1: + version "1.0.2" + resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f" + integrity sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8= + +ws@^7.2.3: + version "7.3.1" + resolved "https://registry.yarnpkg.com/ws/-/ws-7.3.1.tgz#d0547bf67f7ce4f12a72dfe31262c68d7dc551c8" + integrity sha512-D3RuNkynyHmEJIpD2qrgVkc9DQ23OrN/moAwZX4L8DfvszsJxpjQuUq3LMx6HoYji9fbIOBY18XWBsAux1ZZUA== + +yauzl@^2.10.0: + version "2.10.0" + resolved "https://registry.yarnpkg.com/yauzl/-/yauzl-2.10.0.tgz#c7eb17c93e112cb1086fa6d8e51fb0667b79a5f9" + integrity sha1-x+sXyT4RLLEIb6bY5R+wZnt5pfk= + dependencies: + buffer-crc32 "~0.2.3" + fd-slicer "~1.1.0" From 1de577bd78a14b125f81568da27799a76433a547 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Sep 2020 22:19:20 +0000 Subject: [PATCH 02/10] use puppeteer-cluster for parallel crawling use yargs to parse command-line args --- README.md | 37 ++++++++++++++-- index.js | 85 +++++++++++++++++++++++++----------- package.json | 4 +- run.sh | 20 +++++++-- yarn.lock | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 232 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 
9122bc7..1aa0fbe 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,40 @@ zimit -=== +===== -Create ZIM files out of HTTP websites +This version of Zimit runs a single-site headless-Chrome based crawl in a Docker container and produces a ZIM of the crawled content. -# Previous version +The system uses: + - `oldwebtoday/chrome` - to install a recent version of Chrome 84 + - `puppeteer-cluster` - for running Chrome browsers in parallel + - `pywb` - in recording mode for capturing the content + - `warc2zim` - to convert the crawled WARC files into a ZIM + +The driver in `index.js` crawls a given URL using puppeteer-cluster. + +After the crawl is done, warc2zim is used to write a zim to the `/output` directory, which can be mounted as a volume. + +## Usage + +`zimit` is intended to be run in Docker. + +The following is an example usage. The `--cap-add` and `--shm-size` flags are needed for Chrome. + +The image accepts the following parameters: + +- "<url>" - the url to be crawled (required) +- `--workers N` - number of crawl workers to be run in parallel +- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). + + +Example command: + +``` +docker run -d -e NAME=myzimfile -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "<url>" --workers 2 --wait-until domcontentloaded +``` +
+ +## Previous version A first version of a generic HTTP scraper was created in 2016 during the [Wikimania Esino Lario Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon). diff --git a/index.js b/index.js index e68ffdb..305c7e0 100644 --- a/index.js +++ b/index.js @@ -1,45 +1,80 @@ const puppeteer = require("puppeteer-core"); +const { Cluster } = require("puppeteer-cluster"); -const PAGE_TIMEOUT = 60000; +async function run(params) { + const args = [ + "--no-first-run", + "--no-xshm", + `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}` + ]; -async function run() { - const defaultViewport = null; - const browserURL = `http://localhost:9222`; - let browser = null; + const puppeteerOptions = { + headless: true, + executablePath: "/usr/bin/google-chrome", + ignoreHTTPSErrors: true, + args + }; - console.log("waiting for browser..."); + const cluster = await Cluster.launch({ + concurrency: Cluster.CONCURRENCY_PAGE, + maxConcurrency: Number(params.workers) || 1, + skipDuplicateUrls: true, + puppeteerOptions, + puppeteer, + monitor: true + }); + + let seenList = new Set(); + const url = params._[0]; + + let { waitUntil, timeout, scope } = params; + waitUntil = waitUntil || "load"; + timeout = timeout || 60000; + scope = scope || new URL(url).origin + "/"; + + cluster.task(async ({page, data}) => { + const {url} = data; - while (!browser) { try { - browser = await puppeteer.connect({browserURL, defaultViewport}); + await page.goto(url, {waitUntil, timeout}); } catch (e) { - //console.log(e); - await new Promise((resolve) => setTimeout(resolve, 500)); + console.log(`Load timeout for ${url}`); } - } - console.log("connected!"); + try{ + const result = await page.evaluate(() => { + return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href})) + }); - const pages = await browser.pages(); - - const page = pages.length ? 
pages[0] : await browser.newPage(); + for (data of result) { + if (seenList.has(data.url)) { + continue; + } + //console.log(`check ${data.url} in ${allowedDomain}`); + if (scope && data.url.startsWith(scope)) { + seenList.add(data.url); + cluster.queue({url: data.url}); + } + } + } catch (e) { + console.warn("error"); + console.warn(e); + } + }); - console.log(process.argv); - const url = process.argv.length > 2 ? process.argv[2] : ""; + cluster.queue({url}); - if (!url) { - throw "No URL specified, exiting"; - } - - await page.goto(url, {"waitUntil": "networkidle0", "timeout": PAGE_TIMEOUT}); - - console.log("loaded!"); + await cluster.idle(); + await cluster.close(); } async function main() { + const params = require('yargs').argv; + console.log(params); + try { - await run(); + await run(params); process.exit(0); } catch(e) { console.log(e); diff --git a/package.json b/package.json index 1f4226d..93e01f5 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,8 @@ "author": "Ilya Kreymer ", "license": "MIT", "dependencies": { - "puppeteer-core": "^5.3.0" + "puppeteer-cluster": "^0.22.0", + "puppeteer-core": "^5.3.0", + "yargs": "^16.0.3" } } diff --git a/run.sh b/run.sh index f15e6d6..e47d0ea 100755 --- a/run.sh +++ b/run.sh @@ -2,15 +2,27 @@ URL="$1" wb-manager init capture -uwsgi uwsgi.ini & +uwsgi uwsgi.ini &> /dev/null & -/browser/browser_entrypoint.sh /browser/run.sh & +#/browser/browser_entrypoint.sh /browser/run.sh & +#if [[ -n "$PROXY_CA_FILE" && -f "$PROXY_CA_FILE" && -n "$PROXY_HOST" ]]; then +# rm -rf "$HOME/.pki/nssdb" +# mkdir -p "$HOME/.pki/nssdb" +# certutil -d "$HOME/.pki/nssdb" -N +# certutil -d "sql:$HOME/.pki/nssdb" -A -t "C,," -n "Proxy" -i "$PROXY_CA_FILE" +# rm "$PROXY_CA_FILE" +#fi -node index.js "$URL" +#mkdir ~/.config/ +#mkdir ~/.config/google-chrome +#touch ~/.config/google-chrome/First\ Run + +export QT_X11_NO_MITSHM=1 + +node index.js "$@" NAME=${NAME:=zimfile} -stat /output warc2zim --url $URL --name $NAME --output=/output 
./collections/capture/archive/*.warc.gz diff --git a/yarn.lock b/yarn.lock index f0f888d..039a6f7 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2,6 +2,11 @@ # yarn lockfile v1 +"@types/color-name@^1.1.1": + version "1.1.1" + resolved "https://registry.yarnpkg.com/@types/color-name/-/color-name-1.1.1.tgz#1c1261bbeaa10a8055bbc5d8ab84b7b2afc846a0" + integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ== + "@types/node@*": version "14.11.1" resolved "https://registry.yarnpkg.com/@types/node/-/node-14.11.1.tgz#56af902ad157e763f9ba63d671c39cda3193c835" @@ -19,6 +24,19 @@ agent-base@5: resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c" integrity sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g== +ansi-regex@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-5.0.0.tgz#388539f55179bf39339c81af30a654d69f87cb75" + integrity sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg== + +ansi-styles@^4.0.0: + version "4.2.1" + resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-4.2.1.tgz#90ae75c424d008d2624c5bf29ead3177ebfcf359" + integrity sha512-9VGjrMsG1vePxcSweQsN20KY/c4zN0h9fLjqAbwbPfahM3t+NL+M9HC8xeXG2I8pX5NoamTGNuomEUFI7fcUjA== + dependencies: + "@types/color-name" "^1.1.1" + color-convert "^2.0.1" + balanced-match@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.0.tgz#89b4d199ab2bee49de164ea02b89ce462d71b767" @@ -64,6 +82,27 @@ chownr@^1.1.1: resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b" integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== +cliui@^7.0.0: + version "7.0.1" + resolved 
"https://registry.yarnpkg.com/cliui/-/cliui-7.0.1.tgz#a4cb67aad45cd83d8d05128fc9f4d8fbb887e6b3" + integrity sha512-rcvHOWyGyid6I1WjT/3NatKj2kDt9OdSHSXpyLXaMWFbKpGACNW8pRhhdPUq9MWUOdwn8Rz9AVETjF4105rZZQ== + dependencies: + string-width "^4.2.0" + strip-ansi "^6.0.0" + wrap-ansi "^7.0.0" + +color-convert@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-2.0.1.tgz#72d3a68d598c9bdb3af2ad1e84f21d896abd4de3" + integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ== + dependencies: + color-name "~1.1.4" + +color-name@~1.1.4: + version "1.1.4" + resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2" + integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA== + concat-map@0.0.1: version "0.0.1" resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" @@ -81,6 +120,11 @@ devtools-protocol@0.0.799653: resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.799653.tgz#86fc95ce5bf4fdf4b77a58047ba9d2301078f119" integrity sha512-t1CcaZbvm8pOlikqrsIM9GOa7Ipp07+4h/q9u0JXBWjPCjHdBl9KkddX87Vv9vBHoBGtwV79sYQNGnQM6iS5gg== +emoji-regex@^8.0.0: + version "8.0.0" + resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-8.0.0.tgz#e818fd69ce5ccfcb404594f842963bf53164cc37" + integrity sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A== + end-of-stream@^1.1.0, end-of-stream@^1.4.1: version "1.4.4" resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0" @@ -88,6 +132,11 @@ end-of-stream@^1.1.0, end-of-stream@^1.4.1: dependencies: once "^1.4.0" +escalade@^3.0.2: + version "3.1.0" + resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e" + 
integrity sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig== + extract-zip@^2.0.0: version "2.0.1" resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a" @@ -124,6 +173,11 @@ fs.realpath@^1.0.0: resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8= +get-caller-file@^2.0.5: + version "2.0.5" + resolved "https://registry.yarnpkg.com/get-caller-file/-/get-caller-file-2.0.5.tgz#4f94412a82db32f36e3b0b9741f8a97feb031f7e" + integrity sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg== + get-stream@^5.1.0: version "5.2.0" resolved "https://registry.yarnpkg.com/get-stream/-/get-stream-5.2.0.tgz#4966a1795ee5ace65e706c4b7beb71257d6e22d3" @@ -169,6 +223,11 @@ inherits@2, inherits@^2.0.3, inherits@^2.0.4: resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== +is-fullwidth-code-point@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz#f116f8064fe90b3f7844a38997c0b75051269f1d" + integrity sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg== + locate-path@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/locate-path/-/locate-path-5.0.0.tgz#1afba396afd676a6d42504d0a67a3a7eb9f62aa0" @@ -264,6 +323,13 @@ pump@^3.0.0: end-of-stream "^1.1.0" once "^1.3.1" +puppeteer-cluster@^0.22.0: + version "0.22.0" + resolved "https://registry.yarnpkg.com/puppeteer-cluster/-/puppeteer-cluster-0.22.0.tgz#4ab214671f414f15ad6a94a4b61ed0b4172e86e6" + integrity sha512-hmydtMwfVM+idFIDzS8OXetnujHGre7RY3BGL+3njy9+r8Dcu3VALkZHfuBEPf6byKssTCgzxU1BvLczifXd5w== + 
dependencies: + debug "^4.1.1" + puppeteer-core@^5.3.0: version "5.3.0" resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.0.tgz#36d2e10132632c9cb73007f3f2880f4e6b655977" @@ -291,6 +357,11 @@ readable-stream@^3.1.1, readable-stream@^3.4.0: string_decoder "^1.1.1" util-deprecate "^1.0.1" +require-directory@^2.1.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42" + integrity sha1-jGStX9MNqxyXbiNE/+f3kqam30I= + rimraf@^3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a" @@ -303,6 +374,15 @@ safe-buffer@~5.2.0: resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== +string-width@^4.1.0, string-width@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.0.tgz#952182c46cc7b2c313d1596e623992bd163b72b5" + integrity sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg== + dependencies: + emoji-regex "^8.0.0" + is-fullwidth-code-point "^3.0.0" + strip-ansi "^6.0.0" + string_decoder@^1.1.1: version "1.3.0" resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e" @@ -310,6 +390,13 @@ string_decoder@^1.1.1: dependencies: safe-buffer "~5.2.0" +strip-ansi@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.0.tgz#0b1571dd7669ccd4f3e06e14ef1eed26225ae532" + integrity sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w== + dependencies: + ansi-regex "^5.0.0" + tar-fs@^2.0.0: version "2.1.0" resolved 
"https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.0.tgz#d1cdd121ab465ee0eb9ccde2d35049d3f3daf0d5" @@ -349,6 +436,15 @@ util-deprecate@^1.0.1: resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8= +wrap-ansi@^7.0.0: + version "7.0.0" + resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" + integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== + dependencies: + ansi-styles "^4.0.0" + string-width "^4.1.0" + strip-ansi "^6.0.0" + wrappy@1: version "1.0.2" resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f" @@ -359,6 +455,29 @@ ws@^7.2.3: resolved "https://registry.yarnpkg.com/ws/-/ws-7.3.1.tgz#d0547bf67f7ce4f12a72dfe31262c68d7dc551c8" integrity sha512-D3RuNkynyHmEJIpD2qrgVkc9DQ23OrN/moAwZX4L8DfvszsJxpjQuUq3LMx6HoYji9fbIOBY18XWBsAux1ZZUA== +y18n@^5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/y18n/-/y18n-5.0.1.tgz#1ad2a7eddfa8bce7caa2e1f6b5da96c39d99d571" + integrity sha512-/jJ831jEs4vGDbYPQp4yGKDYPSCCEQ45uZWJHE1AoYBzqdZi8+LDWas0z4HrmJXmKdpFsTiowSHXdxyFhpmdMg== + +yargs-parser@^20.0.0: + version "20.0.0" + resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-20.0.0.tgz#c65a1daaa977ad63cebdd52159147b789a4e19a9" + integrity sha512-8eblPHTL7ZWRkyjIZJjnGf+TijiKJSwA24svzLRVvtgoi/RZiKa9fFQTrlx0OKLnyHSdt/enrdadji6WFfESVA== + +yargs@^16.0.3: + version "16.0.3" + resolved "https://registry.yarnpkg.com/yargs/-/yargs-16.0.3.tgz#7a919b9e43c90f80d4a142a89795e85399a7e54c" + integrity sha512-6+nLw8xa9uK1BOEOykaiYAJVh6/CjxWXK/q9b5FpRgNslt8s22F2xMBqVIKgCRjNgGvGPBy8Vog7WN7yh4amtA== + dependencies: + cliui "^7.0.0" + escalade "^3.0.2" + get-caller-file "^2.0.5" + require-directory "^2.1.1" + string-width "^4.2.0" + y18n "^5.0.1" + yargs-parser "^20.0.0" + yauzl@^2.10.0: version 
"2.10.0" resolved "https://registry.yarnpkg.com/yauzl/-/yauzl-2.10.0.tgz#c7eb17c93e112cb1086fa6d8e51fb0667b79a5f9" From 4e04645e6b5bf4a75b563c146557523d23c8be31 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Sep 2020 22:47:19 +0000 Subject: [PATCH 03/10] move warc2zim to be launched by node process --- README.md | 5 ++++- index.js | 12 ++++++++++++ run.sh | 21 +-------------------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 1aa0fbe..4c47f89 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,15 @@ The image accepts the following parameters: - "" - the url to be crawled (required) - `--workers N` - number of crawl workers to be run in parallel - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). +- `--name` - Name of ZIM file (defaults to the hostname of the URL) +- `--output` - output directory (defaults to `/output`) + Example command: ``` -docker run -d -e NAME=myzimfile -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "" --workers 2 --wait-until domcontentloaded +docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "" --name myzimfile --workers 2 --wait-until domcontentloaded ```
diff --git a/index.js b/index.js index 305c7e0..a49beeb 100644 --- a/index.js +++ b/index.js @@ -1,5 +1,6 @@ const puppeteer = require("puppeteer-core"); const { Cluster } = require("puppeteer-cluster"); +const child_process = require("child_process"); async function run(params) { const args = [ @@ -66,6 +67,17 @@ async function run(params) { await cluster.idle(); await cluster.close(); + + const zimName = params.name || new URL(url).hostname; + const zimOutput = params.output || "/output"; + + const warc2zim = `warc2zim --url ${url} --name ${zimName} --output ${zimOutput} ./collections/capture/archive/\*.warc.gz`; + + console.log("Running: " + warc2zim); + + //await new Promise((resolve) => { + child_process.execSync(warc2zim, {shell: "/bin/bash", stdio: "inherit", stderr: "inherit"}); + //}); } diff --git a/run.sh b/run.sh index e47d0ea..b9bd9fc 100755 --- a/run.sh +++ b/run.sh @@ -4,26 +4,7 @@ URL="$1" wb-manager init capture uwsgi uwsgi.ini &> /dev/null & -#/browser/browser_entrypoint.sh /browser/run.sh & -#if [[ -n "$PROXY_CA_FILE" && -f "$PROXY_CA_FILE" && -n "$PROXY_HOST" ]]; then -# rm -rf "$HOME/.pki/nssdb" -# mkdir -p "$HOME/.pki/nssdb" -# certutil -d "$HOME/.pki/nssdb" -N -# certutil -d "sql:$HOME/.pki/nssdb" -A -t "C,," -n "Proxy" -i "$PROXY_CA_FILE" -# rm "$PROXY_CA_FILE" -#fi - -#mkdir ~/.config/ -#mkdir ~/.config/google-chrome -#touch ~/.config/google-chrome/First\ Run - +# needed for chrome export QT_X11_NO_MITSHM=1 node index.js "$@" - -NAME=${NAME:=zimfile} - - -warc2zim --url $URL --name $NAME --output=/output ./collections/capture/archive/*.warc.gz - - From 9b23de828be02cef592827151a99a36e3654fb9f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Sep 2020 15:53:23 -0700 Subject: [PATCH 04/10] Update README.md --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4c47f89..0823579 100644 --- a/README.md +++ b/README.md @@ -17,24 +17,32 @@ After the crawl is done, 
warc2zim is used to write a zim to the `/output` direct `zimit` is intended to be run in Docker. -The following is an example usage. The `--cap-add` and `--shm-size` flags are needed for Chrome. +To build locally run: + +``` +docker build -t openzim/zimit . +``` The image accepts the following parameters: -- "" - the url to be crawled (required) +- `URL` - the url to be crawled (required) - `--workers N` - number of crawl workers to be run in parallel - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). - `--name` - Name of ZIM file (defaults to the hostname of the URL) - `--output` - output directory (defaults to `/output`) - +The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker. Example command: ``` -docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit "" --name myzimfile --workers 2 --wait-until domcontentloaded +docker run -v /output:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1gb openzim/zimit URL --name myzimfile --workers 2 --wait-until domcontentloaded ``` +The puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log. + + +
## Previous version From ff2773677ca17c76544d44dbe057d23852644e94 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Sep 2020 23:18:15 +0000 Subject: [PATCH 05/10] crawling: move checking logic to shouldCrawl, remove hashtag before checking seen list --- index.js | 56 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/index.js b/index.js index a49beeb..a5a2103 100644 --- a/index.js +++ b/index.js @@ -3,12 +3,13 @@ const { Cluster } = require("puppeteer-cluster"); const child_process = require("child_process"); async function run(params) { + // Chrome Flags, including proxy server const args = [ - "--no-first-run", - "--no-xshm", + "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically) `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}` ]; + // Puppeter Options const puppeteerOptions = { headless: true, executablePath: "/usr/bin/google-chrome", @@ -16,6 +17,7 @@ async function run(params) { args }; + // Puppeteer Cluster init and options const cluster = await Cluster.launch({ concurrency: Cluster.CONCURRENCY_PAGE, maxConcurrency: Number(params.workers) || 1, @@ -25,14 +27,22 @@ async function run(params) { monitor: true }); + // Maintain own seen list let seenList = new Set(); const url = params._[0]; let { waitUntil, timeout, scope } = params; + + // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options) waitUntil = waitUntil || "load"; + + // Timeout per page timeout = timeout || 60000; + + // Scope for crawl, default to the domain of the URL scope = scope || new URL(url).origin + "/"; + // Crawl Task cluster.task(async ({page, data}) => { const {url} = data; @@ -48,13 +58,10 @@ async function run(params) { }); for (data of result) { - if (seenList.has(data.url)) { - continue; - } - //console.log(`check ${data.url} in ${allowedDomain}`); - if (scope && data.url.startsWith(scope)) { - 
seenList.add(data.url); - cluster.queue({url: data.url}); + const newUrl = shouldCrawl(scope, seenList, data.url); + if (newUrl) { + seenList.add(newUrl); + cluster.queue({url: newUrl}); } } } catch (e) { @@ -81,6 +88,37 @@ async function run(params) { } +function shouldCrawl(scope, seenList, url) { + try { + url = new URL(url); + } catch(e) { + return false; + } + + // remove hashtag + url.hash = ""; + + // only queue http/https URLs + if (url.protocol != "http:" && url.protocol != "https:") { + return false; + } + + url = url.href; + + // skip already crawled + if (seenList.has(url)) { + return false; + } + + // if scope is provided, skip urls not in scope + if (scope && !url.startsWith(scope)) { + return false; + } + + return url; +} + + async function main() { const params = require('yargs').argv; console.log(params); From b00c4262a7af9c88178a1c22a30dd9b0bd0e320f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Sep 2020 07:14:23 +0000 Subject: [PATCH 06/10] add --limit param for max URLs to be captured add 'html check', only load HTML in browsers, load other content-types directly via pywb, esp for PDFs (work on #8) improved error handling --- Dockerfile | 4 +-- README.md | 1 + index.js | 87 ++++++++++++++++++++++++++++++++++++++++++++++------ package.json | 2 ++ run.sh | 8 +++-- yarn.lock | 17 ++++++++++ 6 files changed, 105 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2a9360d..af6955c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,18 +28,18 @@ RUN useradd zimit --shell /bin/bash --create-home \ WORKDIR /app -ADD index.js /app/ ADD package.json /app/ RUN chown -R zimit /app -USER zimit +#USER zimit RUN yarn install ADD config.yaml /app/ ADD uwsgi.ini /app/ ADD run.sh /app/ +ADD index.js /app/ ENTRYPOINT ["/app/run.sh"] diff --git a/README.md b/README.md index 0823579..5a23eff 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ The image accepts the following parameters: - `--wait-until` - Puppeteer setting for how 
long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). - `--name` - Name of ZIM file (defaults to the hostname of the URL) - `--output` - output directory (defaults to `/output`) +- `--limit U` - Limit capture to at most U URLs The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker. diff --git a/index.js b/index.js index a5a2103..d43eddd 100644 --- a/index.js +++ b/index.js @@ -1,6 +1,11 @@ const puppeteer = require("puppeteer-core"); const { Cluster } = require("puppeteer-cluster"); const child_process = require("child_process"); +const fetch = require("node-fetch"); +const AbortController = require("abort-controller"); + +const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"]; + async function run(params) { // Chrome Flags, including proxy server @@ -9,6 +14,9 @@ async function run(params) { `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}` ]; + // prefix for direct capture via pywb + const capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/capture/record/id_/`; + // Puppeter Options const puppeteerOptions = { headless: true, @@ -19,7 +27,7 @@ async function run(params) { // Puppeteer Cluster init and options const cluster = await Cluster.launch({ - concurrency: Cluster.CONCURRENCY_PAGE, + concurrency: Cluster.CONCURRENCY_CONTEXT, maxConcurrency: Number(params.workers) || 1, skipDuplicateUrls: true, puppeteerOptions, @@ -31,45 +39,68 @@ async function run(params) { let seenList = new Set(); const url = params._[0]; - let { waitUntil, timeout, scope } = params; + let { waitUntil, timeout, scope, limit } = params; // waitUntil condition (see: 
https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options) waitUntil = waitUntil || "load"; // Timeout per page - timeout = timeout || 60000; + timeout = Number(timeout) || 60000; // Scope for crawl, default to the domain of the URL scope = scope || new URL(url).origin + "/"; + // Limit number of pages captured + limit = Number(limit) || 0; + + console.log("Limit: " + limit); + + // links crawled counter + let numLinks = 0; + // Crawl Task cluster.task(async ({page, data}) => { const {url} = data; + if (!await htmlCheck(url, capturePrefix)) { + return; + } + try { await page.goto(url, {waitUntil, timeout}); } catch (e) { console.log(`Load timeout for ${url}`); } - try{ - const result = await page.evaluate(() => { + let results = null; + + try { + results = await page.evaluate(() => { return [...document.querySelectorAll('a[href]')].map(el => ({ url: el.href})) }); + } catch (e) { + console.warn("Link Extraction failed", e); + return; + } - for (data of result) { + try { + for (data of results) { const newUrl = shouldCrawl(scope, seenList, data.url); + if (newUrl) { seenList.add(newUrl); + if (numLinks++ >= limit && limit > 0) { + break; + } cluster.queue({url: newUrl}); } } } catch (e) { - console.warn("error"); - console.warn(e); + console.log("Queuing Error: " + e); } }); + numLinks++; cluster.queue({url}); await cluster.idle(); @@ -118,6 +149,43 @@ function shouldCrawl(scope, seenList, url) { return url; } +async function htmlCheck(url, capturePrefix) { + try { + const resp = await fetch(url, {method: "HEAD"}); + + if (resp.status >= 400) { + console.log(`Skipping ${url}, invalid status ${resp.status}`); + return false; + } + + const contentType = resp.headers.get("Content-Type"); + + // just load if no content-type + if (!contentType) { + return true; + } + + const mime = contentType.split(";")[0]; + + if (HTML_TYPES.includes(mime)) { + return true; + } + + // capture directly + console.log(`Direct capture: ${capturePrefix}${url}`); + 
const abort = new AbortController(); + const signal = abort.signal; + const resp2 = await fetch(capturePrefix + url, {signal}); + abort.abort(); + + return false; + } catch(e) { + console.log("HTML Check error", e); + // can't confirm not html, so try in browser + return true; + } +} + async function main() { const params = require('yargs').argv; @@ -127,7 +195,8 @@ async function main() { await run(params); process.exit(0); } catch(e) { - console.log(e); + console.error("Crawl failed, ZIM creation skipped"); + console.error(e); process.exit(1); } } diff --git a/package.json b/package.json index 93e01f5..c304660 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,8 @@ "author": "Ilya Kreymer ", "license": "MIT", "dependencies": { + "abort-controller": "^3.0.0", + "node-fetch": "^2.6.1", "puppeteer-cluster": "^0.22.0", "puppeteer-core": "^5.3.0", "yargs": "^16.0.3" diff --git a/run.sh b/run.sh index b9bd9fc..74d0752 100755 --- a/run.sh +++ b/run.sh @@ -1,10 +1,12 @@ #!/bin/bash -URL="$1" - wb-manager init capture uwsgi uwsgi.ini &> /dev/null & # needed for chrome export QT_X11_NO_MITSHM=1 -node index.js "$@" +cmd="$@" + +su zimit -c "node index.js $cmd" + + diff --git a/yarn.lock b/yarn.lock index 039a6f7..d793a9a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -19,6 +19,13 @@ dependencies: "@types/node" "*" +abort-controller@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/abort-controller/-/abort-controller-3.0.0.tgz#eaf54d53b62bae4138e809ca225c8439a6efb392" + integrity sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg== + dependencies: + event-target-shim "^5.0.0" + agent-base@5: version "5.1.1" resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c" @@ -137,6 +144,11 @@ escalade@^3.0.2: resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.0.tgz#e8e2d7c7a8b76f6ee64c2181d6b8151441602d4e" integrity 
sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig== +event-target-shim@^5.0.0: + version "5.0.1" + resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789" + integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ== + extract-zip@^2.0.0: version "2.0.1" resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-2.0.1.tgz#663dca56fe46df890d5f131ef4a06d22bb8ba13a" @@ -257,6 +269,11 @@ ms@2.1.2: resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== +node-fetch@^2.6.1: + version "2.6.1" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052" + integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw== + once@^1.3.0, once@^1.3.1, once@^1.4.0: version "1.4.0" resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1" From f2522459833249076edb77292db1ef7979f6cb1d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 22 Sep 2020 06:09:33 +0000 Subject: [PATCH 07/10] try using regular puppeteer, only copy deps from chrome image pywb: increase uwsgi processes, disable autoindex/autofetch for better perf --- Dockerfile | 12 ++++++------ config.yaml | 4 ++-- index.js | 6 +++--- package.json | 2 +- uwsgi.ini | 3 ++- yarn.lock | 6 +++--- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index af6955c..d51dfb6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,16 +10,16 @@ ENV PROXY_HOST=localhost \ RUN pip install pywb uwsgi warc2zim -COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/ +#COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/ -COPY --from=chrome /app/ 
/browser/ +#COPY --from=chrome /app/ /browser/ COPY --from=chrome /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/ COPY --from=chrome /lib/x86_64-linux-gnu/libdbus* /lib/x86_64-linux-gnu/ -COPY --from=chrome /usr/bin/run_forever /usr/bin/ -COPY --from=chrome /usr/bin/wget /usr/bin/ -COPY --from=chrome /usr/bin/certutil /usr/bin/ +#COPY --from=chrome /usr/bin/run_forever /usr/bin/ +#COPY --from=chrome /usr/bin/wget /usr/bin/ +#COPY --from=chrome /usr/bin/certutil /usr/bin/ -RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome +#RUN ln -s /opt/google/chrome/google-chrome /usr/bin/google-chrome RUN useradd zimit --shell /bin/bash --create-home \ && usermod -a -G sudo zimit \ diff --git a/config.yaml b/config.yaml index 2250640..a072f5a 100644 --- a/config.yaml +++ b/config.yaml @@ -9,9 +9,9 @@ proxy: recorder: live -autoindex: 10 +#autoindex: 10 -enable_auto_fetch: true +#enable_auto_fetch: true collections: live: $live diff --git a/index.js b/index.js index d43eddd..812f6ae 100644 --- a/index.js +++ b/index.js @@ -1,4 +1,4 @@ -const puppeteer = require("puppeteer-core"); +const puppeteer = require("puppeteer"); const { Cluster } = require("puppeteer-cluster"); const child_process = require("child_process"); const fetch = require("node-fetch"); @@ -20,14 +20,14 @@ async function run(params) { // Puppeter Options const puppeteerOptions = { headless: true, - executablePath: "/usr/bin/google-chrome", + //executablePath: "/usr/bin/google-chrome", ignoreHTTPSErrors: true, args }; // Puppeteer Cluster init and options const cluster = await Cluster.launch({ - concurrency: Cluster.CONCURRENCY_CONTEXT, + concurrency: Cluster.CONCURRENCY_PAGE, maxConcurrency: Number(params.workers) || 1, skipDuplicateUrls: true, puppeteerOptions, diff --git a/package.json b/package.json index c304660..47efcbb 100644 --- a/package.json +++ b/package.json @@ -8,8 +8,8 @@ "dependencies": { "abort-controller": "^3.0.0", "node-fetch": "^2.6.1", + "puppeteer": "^5.3.0", 
"puppeteer-cluster": "^0.22.0", - "puppeteer-core": "^5.3.0", "yargs": "^16.0.3" } } diff --git a/uwsgi.ini b/uwsgi.ini index f415aa3..1ded14e 100644 --- a/uwsgi.ini +++ b/uwsgi.ini @@ -12,13 +12,14 @@ if-env = VIRTUAL_ENV venv = $(VIRTUAL_ENV) endif = -gevent = 100 +gevent = 200 #Not available until uwsgi 2.1 #monkey-patching manually in pywb.apps.wayback #gevent-early-monkey-patch = # for uwsgi<2.1, set env when using gevent env = GEVENT_MONKEY_PATCH=1 +processes = 8 # specify config file here env = PYWB_CONFIG_FILE=config.yaml diff --git a/yarn.lock b/yarn.lock index d793a9a..458e423 100644 --- a/yarn.lock +++ b/yarn.lock @@ -347,10 +347,10 @@ puppeteer-cluster@^0.22.0: dependencies: debug "^4.1.1" -puppeteer-core@^5.3.0: +puppeteer@^5.3.0: version "5.3.0" - resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-5.3.0.tgz#36d2e10132632c9cb73007f3f2880f4e6b655977" - integrity sha512-+4wk+0dcDNg7AQqN41Q9r41U6iltAtknuVBI0aj0O/Vp8/4orgbFV0wn55wV5xRae//CucLPUnaczxZx7dz0UA== + resolved "https://registry.yarnpkg.com/puppeteer/-/puppeteer-5.3.0.tgz#0abf83d0f2d1273baf2b56885a813f8052903e33" + integrity sha512-GjqMk5GRro3TO0sw3QMsF1H7n+/jaK2OW45qMvqjYUyJ7y4oA//9auy969HHhTG3HZXaMxY/NWXF/NXlAFIvtw== dependencies: debug "^4.1.0" devtools-protocol "0.0.799653" From f25b390f15f4929e41ebad98da4008fd86abab92 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 22 Sep 2020 17:48:09 +0000 Subject: [PATCH 08/10] add regex exclusions --- README.md | 1 + index.js | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5a23eff..f42d6df 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ The image accepts the following parameters: - `--name` - Name of ZIM file (defaults to the hostname of the URL) - `--output` - output directory (defaults to `/output`) - `--limit U` - Limit capture to at most U URLs +- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. 
The following is an example usage. The `--cap-add` and `--shm-size` flags are needed to run Chrome in Docker. diff --git a/index.js b/index.js index 812f6ae..9f9bbb4 100644 --- a/index.js +++ b/index.js @@ -39,7 +39,7 @@ async function run(params) { let seenList = new Set(); const url = params._[0]; - let { waitUntil, timeout, scope, limit } = params; + let { waitUntil, timeout, scope, limit, exclude } = params; // waitUntil condition (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options) waitUntil = waitUntil || "load"; @@ -53,6 +53,14 @@ async function run(params) { // Limit number of pages captured limit = Number(limit) || 0; + if (exclude) { + if (typeof(exclude) === "string") { + exclude = [new RegExp(exclude)]; + } else { + exclude = exclude.map(e => new RegExp(e)); + } + } + console.log("Limit: " + limit); // links crawled counter @@ -85,7 +93,7 @@ async function run(params) { try { for (data of results) { - const newUrl = shouldCrawl(scope, seenList, data.url); + const newUrl = shouldCrawl(scope, seenList, data.url, exclude); if (newUrl) { seenList.add(newUrl); @@ -119,7 +127,7 @@ async function run(params) { } -function shouldCrawl(scope, seenList, url) { +function shouldCrawl(scope, seenList, url, exclude) { try { url = new URL(url); } catch(e) { @@ -146,6 +154,14 @@ function shouldCrawl(scope, seenList, url) { return false; } + // check exclusions + for (const e of exclude) { + if (e.exec(url)) { + //console.log(`Skipping ${url} excluded by ${e}`); + return false; + } + } + return url; } From 6a925748d557df6f28527533353d6ab36976a473 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 22 Sep 2020 18:12:15 +0000 Subject: [PATCH 09/10] excludes: fix no exclude default --- index.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/index.js b/index.js index 9f9bbb4..0f4103b 100644 --- a/index.js +++ b/index.js @@ -59,6 +59,8 @@ async function run(params) { } else { exclude = exclude.map(e => new RegExp(e)); } + } else { + 
exclude = []; } console.log("Limit: " + limit); From 71e94914aa07f0aa2bfc77d33ea9982d18779099 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Wed, 23 Sep 2020 08:42:08 +0000 Subject: [PATCH 10/10] Added gevent update to prevent segfault in uwsgi --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index d51dfb6..432b827 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,8 @@ ENV PROXY_HOST=localhost \ NO_SOCAT=1 RUN pip install pywb uwsgi warc2zim +# force reinstall of gevent to prevent segfault on uwsgi worker +RUN pip install -U gevent #COPY --from=chrome /opt/google/chrome/ /opt/google/chrome/