/** * transformZimit.js: Functions to enable reading of Zimit ZIM format. * * Copyright 2022 Jaifroid, Mossroy and contributors. * Licence: GPL v3. * * This file is part of Kiwix. * * Kiwix is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public Licence as published by * the Free Software Foundation, either version 3 of the Licence, or * (at your option) any later version. * * Kiwix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public Licence for more details. * * You should have received a copy of the GNU General Public Licence * along with Kiwix (file LICENSE-GPLv3.txt). If not, see https://www.gnu.org/licenses/. */ /** * transformZimit.js: Functions to enable reading of Zimit ZIM format. * * Copyright 2022 Jaifroid, Mossroy and contributors. * Licence: GPL v3. * * This file is part of Kiwix. * * Kiwix is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public Licence as published by * the Free Software Foundation, either version 3 of the Licence, or * (at your option) any later version. * * Kiwix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public Licence for more details. * * You should have received a copy of the GNU General Public Licence * along with Kiwix (file LICENSE-GPLv3.txt). If not, see https://www.gnu.org/licenses/. 
*/ 'use strict'; import uiUtil from './uiUtil.js'; /* global appstate, params */ /** * Flags the Replay system files and similar unwanted scripts so that they can be filtered out (since these cannot be loaded alongside a Service Worker without error) * A dirEntry whose URL matches the blocklist pattern in the function body gets a 'nullify' property set to true * In the case of the H prefix or the landing page, an 'inspect' property is added to the dirEntry, so that we can discover * the underlying Zimit landing page below * @param {dirEntry} dirEntry The directory entry to modify or annul * @returns {dirEntry|null} The same directory entry, possibly flagged with 'inspect' and/or 'nullify', or null if the dirEntry is missing or has no url */ function filterReplayFiles (dirEntry) { /* Nothing to filter without both an entry and a URL */ if (!(dirEntry && dirEntry.url)) return null; /* && binds tighter than ||, so this reads: H namespace, or (C namespace with an H/ URL prefix), or (landing page whose URL is index.html, optionally under A/) */ if (dirEntry.namespace === 'H' || dirEntry.namespace === 'C' && /^H\//.test(dirEntry.url) || params.isLandingPage && /^(A\/)?index\.html(?:[?#]|$)/.test(dirEntry.url)) { dirEntry.inspect = true; } /* Case-insensitive blocklist of Replay, analytics and advertising script URLs; each alternative must be followed by the end of the URL or by a query string or fragment */ if (/(?:\bload\.js|\bsw\.js|analytics.*\.js|update\.googleapis|play\.google.*(?:stats|logs)|youtube\.com.*\/stats|google\.internal|syndication|survey\.js|yuiloader\.js|doubleclick|play\.google\.|developer\.mozilla\.org\/static\/js\/main\..+\.js|webpushr.*\.com.*\.js)(?:[?#]|$)/i.test(dirEntry.url)) { dirEntry.nullify = true; } return dirEntry; } /** * Inspects the HTML of the ZIM archive's landing page or of the requested Header to discover the URL to redirect to * adds a custom redirect to the dirEntry * @param {dirEntry} dirEntry The directory entry of the landing page or H-prefixed header to process * @param {String} data The decoded data which the dirEntry points to * @param {String} cns The Content Name Space of the ZIM (usually 'C' or 'A') * @returns {dirEntry} The modified directory entry */ function getZimitRedirect (dirEntry, data, cns) { var redirect; // Type 1 ZIMs don't use the H namespace, and instead use H as prefix to the URL if (dirEntry.namespace === 'H' || cns === 'C' && /^H\//.test(dirEntry.url)) { // We are dealing with a Header redirect, so we need to find the Location: field redirect = data.match(/^Location:\s*https?:\/\/([^/]+)(.*)$/m); if (!redirect) redirect = 
data.match(/^WARC-Target-URI:\s*https?:\/\/([^/]+)(.*)$/m) if (redirect && redirect[1]) { // Type 1 Zimit ZIMs need intermediary 'A' prefix, since there is no longer any A namespace params.zimitPrefix = (cns === 'C' ? 'A/' : '') + redirect[1]; dirEntry.zimitRedirect = cns + '/' + params.zimitPrefix + redirect[2]; } else { dirEntry.zimitRedirect = null; } } else if (/301\s*moved\s+permanently/i.test(data)) { redirect = data.match(/moved\s+permanently(?:[^<]|<(?!a\s))+]*?[\s;])(?:src\b|href|url)\s*(=\s*(["']))(?=[./]+|https?)((?:[^>](?!\3|\?|#))+[^>])([^>]*>)/ig; var regexpZimitJavascriptLinks = /['"(]((?=[^'"?#)]+\.(?:com?\b|net\b|org\b))|(?:(?:https?:)?\/\/)[^'"?#)]*)['"?#)]/ig; var regexpZimitCssLinks = /\burl\s*\(['"\s]*([^)'"\s]+)['"\s]*\)/ig; var regexpGetZimitPrefix = /link\s+rel=["']canonical["']\s+href="https?:\/\/([^/"]+)/i; var regexpRemoveAnalytics1 = /))+?(?:google.*?analytics|adsbygoogle|googleads|doubleclick|pubads|syndication)([^<]|<(?!\/script>))+<\/script>\s*/ig; var regexpRemoveAnalytics2 = /))+?adsbygoogle(?:[^<]|<(?!\/ins>))+<\/ins>\s*/ig; var regexpInlineScriptsNotMaths = /<(script\b(?![^>]+type\s*=\s*["'](?:math\/|text\/html|[^"']*?math))(?:[^<]|<(?!\/script>))+<\/script)>/ig; /** * The main function for transforming Zimit URLs into standard ZIM URLs. 
* @param {dirEntry} dirEntry The directory entry that points to the extracted data * @param {String} data The deocmpressed and extracted textual data that the dirEntry points to * @param {String} mimetype The reported mimetype of the data (this is also in the dirEntry) * @returns {String} The transformed data string */ function transformReplayUrls (dirEntry, data, mimetype) { /** * Transform URL links in HTML files * Note that some Zimit ZIMs have mimeteypes like 'text/html;raw=true', so we can't simply match 'text/html' * Other ZIMs have a mimetype like 'html' (with no 'text/'), so we have to match as generically as possible */ // console.debug('**** Transforming URLs in ' + dirEntry.namespace + '/' + dirEntry.url + ' ****'); var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name); if (/\bx?html\b/i.test(mimetype)) { var zimitPrefix = data.match(regexpGetZimitPrefix); // If the URL is the same as the URL with everything after the first / removed, then we are in the root directory // We use this to decide whether to remove any relative link prefixes like ../ var rootDirectory = dirEntry.url === dirEntry.url.replace(/^((?:A\/)?[^/]+\/?).*/, '$1'); params.zimitPrefix = zimitPrefix ? (dirEntry.namespace === 'C' ? 'A/' : '') + zimitPrefix[1] : params.zimitPrefix; // Remove lazyimgage system and noscript tags that comment out images // DEV: Check if this is still necessary data = data.replace(/