/*
 * kiwix-js-pwa/www/js/lib/transformZimit.js
 * 2023-11-10 10:11:30 +00:00
 *
 * 363 lines
 * 20 KiB
 * JavaScript
 */

/**
* transformZimit.js: Functions to enable reading of Zimit ZIM format.
*
* Copyright 2022 Jaifroid, Mossroy and contributors.
* Licence: GPL v3.
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>.
*/
/**
* transformZimit.js: Functions to enable reading of Zimit ZIM format.
*
* Copyright 2022 Jaifroid, Mossroy and contributors.
* Licence: GPL v3.
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>.
*/
'use strict';
import uiUtil from './uiUtil.js';
/* global appstate, params */
/**
 * Flags directory entries that need special treatment when reading a Zimit ZIM.
 * Replay system files (which cannot be loaded alongside a Service Worker without error), together with a
 * number of analytics and advertising scripts, receive a 'nullify' property. Entries in the H namespace,
 * C-namespace entries whose URL starts with 'H/', and (while params.isLandingPage is set) the index.html
 * landing page receive an 'inspect' property, so that the underlying Zimit landing page can be discovered
 * @param {dirEntry} dirEntry The directory entry to modify or annul
 * @returns {dirEntry} The same directory entry with any flags added, or null if it is missing or has no url
 */
function filterReplayFiles (dirEntry) {
    // Without an entry that carries a URL there is nothing to inspect or nullify
    if (!dirEntry || !dirEntry.url) return null;
    var entryUrl = dirEntry.url;
    // Headers are either in the H namespace, or in the C namespace with an 'H/' URL prefix
    var isHeader = dirEntry.namespace === 'H' || (dirEntry.namespace === 'C' && /^H\//.test(entryUrl));
    // params is only consulted when the entry is not already known to be a header
    if (isHeader || (params.isLandingPage && /^(A\/)?index\.html(?:[?#]|$)/.test(entryUrl))) {
        dirEntry.inspect = true;
    }
    // Each alternative must be followed directly by '?', '#' or the end of the URL
    var rgxReplayOrTrackingFile = /(?:\bload\.js|\bsw\.js|analytics.*\.js|update\.googleapis|play\.google.*(?:stats|logs)|youtube\.com.*\/stats|google\.internal|syndication|survey\.js|yuiloader\.js|doubleclick|play\.google\.|developer\.mozilla\.org\/static\/js\/main\..+\.js)(?:[?#]|$)/i;
    if (rgxReplayOrTrackingFile.test(entryUrl)) {
        dirEntry.nullify = true;
    }
    return dirEntry;
}
/**
 * Inspects the HTML of the ZIM archive's landing page, the text of a requested Header, or a
 * "301 moved permanently" page, to discover the URL to redirect to, and stores that URL in the
 * zimitRedirect property of the dirEntry (null when no target could be extracted).
 * Side effects: params.zimitPrefix is updated when a header or landing-page target is found, and
 * params.zimitStartPage is set (or reset to null) when the landing page is processed
 * @param {dirEntry} dirEntry The directory entry of the landing page or H-prefixed header to process
 * @param {String} data The decoded data which the dirEntry points to
 * @param {String} cns The Content Name Space of the ZIM (usually 'C' or 'A')
 * @returns {dirEntry} The modified directory entry
 */
function getZimitRedirect (dirEntry, data, cns) {
    var redirect;
    // Type 1 ZIMs don't use the H namespace, and instead use H as prefix to the URL
    if (dirEntry.namespace === 'H' || cns === 'C' && /^H\//.test(dirEntry.url)) {
        // We are dealing with a Header redirect, so we need to find the Location: field, falling back to the WARC target
        // NOTE(review): both field names are matched case-sensitively; confirm that lower-case 'location:' never occurs in Zimit headers
        redirect = data.match(/^Location:\s*https?:\/\/([^/]+)(.*)$/m);
        if (!redirect) redirect = data.match(/^WARC-Target-URI:\s*https?:\/\/([^/]+)(.*)$/m);
        if (redirect && redirect[1]) {
            // Type 1 Zimit ZIMs need intermediary 'A' prefix, since there is no longer any A namespace
            params.zimitPrefix = (cns === 'C' ? 'A/' : '') + redirect[1];
            dirEntry.zimitRedirect = cns + '/' + params.zimitPrefix + redirect[2];
        } else {
            dirEntry.zimitRedirect = null;
        }
    } else if (/301\s*moved\s+permanently/i.test(data)) {
        // The target is the first quoted value inside the first <a ...> tag that follows the "moved permanently" text
        redirect = data.match(/moved\s+permanently(?:[^<]|<(?!a\s))+<a\s[^"']+["'](?:https?:)?\/?\/?([^"']+)/i);
        if (redirect && redirect[1]) {
            // Remove any port
            var zimitRedirect = redirect[1].replace(/^([^/]+):\d+(\/)/, '$1$2');
            dirEntry.zimitRedirect = cns + '/' + (cns === 'C' ? 'A/' : '') + zimitRedirect;
            console.debug('*** Asset moved permanently! Redirecting to: ' + dirEntry.zimitRedirect + ' ***');
        } else {
            // Report failure in the same way as the other branches, instead of leaving the property unset
            dirEntry.zimitRedirect = null;
            console.debug('*** Asset moved permanently, but no redirect target could be extracted ***');
        }
    } else {
        // Landing page: the start page is declared in a window.mainUrl assignment
        redirect = data.match(/window\.mainUrl\s*=\s*(['"])https?:\/\/([^/]+)(.+?)\1/);
        if (redirect && redirect[2] && redirect[3]) {
            // Logic added for Type 1 Zimit ZIMs
            params.zimitPrefix = (dirEntry.namespace === 'C' ? 'A/' : '') + redirect[2];
            params.zimitStartPage = dirEntry.namespace + '/' + params.zimitPrefix + redirect[3];
        } else {
            params.zimitStartPage = null;
        }
        dirEntry.zimitRedirect = params.zimitStartPage;
    }
    return dirEntry;
}
/**
 * Establish some Regular Expressions used by the transformReplayUrls function.
 * They are compiled once at module level; those carrying the 'g' flag are only passed to
 * String.prototype.replace in this file
 */
// Matches an opening <a>, <img>, <script>, <link>, <track>, <meta> or <iframe> tag that has a quoted src, href or url
// attribute whose value begins with '.', '/' or http(s). Captures: 1 = start of the tag up to the attribute name,
// 2 = '=' plus the opening quote, 3 = the quote character, 4 = the URL up to (but excluding) the closing quote, '?' or '#',
// 5 = the remainder of the tag including the closing '>'
var regexpZimitHtmlLinks = /(<(?:a|img|script|link|track|meta|iframe)\b[^>]*?[\s;])(?:src\b|href|url)\s*(=\s*(["']))(?=[./]+|https?)((?:[^>](?!\3|\?|#))+[^>])([^>]*>)/ig;
// Matches a quoted or parenthesized string that starts with '//' or 'http(s)://', capturing the URL up to the next quote, '?', '#' or ')'
// NOTE(review): the first alternative inside the capture group is a bare lookahead, so it captures nothing and the
// following character class can then never match; only the protocol alternative appears able to succeed - confirm intent
var regexpZimitJavascriptLinks = /['"(]((?=[^'"?#)]+\.(?:com?\b|net\b|org\b))|(?:(?:https?:)?\/\/)[^'"?#)]*)['"?#)]/ig;
// Matches a CSS url(...) value and captures the URL without surrounding quotes or whitespace
var regexpZimitCssLinks = /\burl\s*\(['"\s]*([^)'"\s]+)['"\s]*\)/ig;
// Captures the host name from a <link rel="canonical" href="http(s)://..."> tag (the href must be double-quoted)
var regexpGetZimitPrefix = /link\s+rel=["']canonical["']\s+href="https?:\/\/([^/"]+)/i;
// Matches a whole <script>...</script> element that mentions Google analytics, adsbygoogle, googleads, doubleclick, pubads or syndication
var regexpRemoveAnalytics1 = /<script\b([^<]|<(?!\/script>))+?(?:google.*?analytics|adsbygoogle|googleads|doubleclick|pubads|syndication)([^<]|<(?!\/script>))+<\/script>\s*/ig;
// Matches a whole <ins>...</ins> element that mentions adsbygoogle
var regexpRemoveAnalytics2 = /<ins\b(?:[^<]|<(?!\/ins>))+?adsbygoogle(?:[^<]|<(?!\/ins>))+<\/ins>\s*/ig;
// Matches a <script>...</script> element unless its type attribute starts with 'math/' or 'text/html', or contains 'math'.
// Captures everything between the leading '<' and the final '>'
var regexpInlineScriptsNotMaths = /<(script\b(?![^>]+type\s*=\s*["'](?:math\/|text\/html|[^"']*?math))(?:[^<]|<(?!\/script>))+<\/script)>/ig;
/**
 * The main function for transforming Zimit URLs into standard ZIM URLs.
 * Depending on the mimetype, links in HTML tags, image srcsets, CSS url() values and URL-like strings in JavaScript
 * are rewritten so that they point below <app directory>/<archive file name>/<namespace>/. Some analytics and
 * advertising resources are removed, and a few site-specific overrides are applied.
 * Side effect: params.zimitPrefix is updated if the HTML contains a canonical link.
 * Reads window.location.pathname, appstate.selectedArchive.file.name, params.zimitPrefix and params.contentInjectionMode
 * @param {dirEntry} dirEntry The directory entry that points to the extracted data
 * @param {String} data The decompressed and extracted textual data that the dirEntry points to
 * @param {String} mimetype The reported mimetype of the data (this is also in the dirEntry)
 * @returns {String} The transformed data string
 */
function transformReplayUrls (dirEntry, data, mimetype) {
    var namespace = dirEntry.namespace;
    // Directory of the current app page, followed by the URI-encoded file name of the selected archive
    var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name);
    // Root for URLs that carry their own host name: Type 1 ZIMs (C namespace) need the intermediary 'A/' prefix
    var hostRoot = indexRoot + '/' + namespace + '/' + (namespace === 'C' ? 'A/' : '');
    // The roots are supplied to String.replace through functions, so that a '$' in the archive name or prefix
    // is inserted literally instead of being interpreted as a special replacement pattern
    var getHostRoot = function () {
        return hostRoot;
    };
    // Root for URLs relative to the current site; params.zimitPrefix is read at call time because it can change below
    var getSiteRoot = function () {
        return indexRoot + '/' + namespace + '/' + params.zimitPrefix + '/';
    };
    // Only calculated for HTML, so it stays false when a standalone stylesheet is transformed
    var rootDirectory = false;

    /**
     * Transform URL links in HTML files
     * Note that some Zimit ZIMs have mimetypes like 'text/html;raw=true', so we can't simply match 'text/html'
     * Other ZIMs have a mimetype like 'html' (with no 'text/'), so we have to match as generically as possible
     */
    if (/\bx?html\b/i.test(mimetype)) {
        var zimitPrefix = data.match(regexpGetZimitPrefix);
        // If the URL is the same as the URL with everything after the first / removed, then we are in the root directory
        // We use this to decide whether to remove any relative link prefixes like ../
        rootDirectory = dirEntry.url === dirEntry.url.replace(/^((?:A\/)?[^/]+\/?).*/, '$1');
        params.zimitPrefix = zimitPrefix ? (namespace === 'C' ? 'A/' : '') + zimitPrefix[1] : params.zimitPrefix;
        // Remove lazyimage system and noscript tags that comment out images
        // DEV: Check if this is still necessary
        data = data.replace(/<noscript>\s*(<img\b[^>]+>)\s*<\/noscript>/ig, '$1');
        data = data.replace(/<span\b[^>]+lazy-image-placeholder[^<]+<\/span>\s*/ig, '');
        // Remove meta http-equiv refresh from assets
        if (dirEntry.isAsset) data = data.replace(/<meta\s+http-equiv[^>]+refresh\b[^>]+>\s*/i, '');
        data = data.replace(regexpZimitHtmlLinks, function (match, blockStart, equals, quote, relAssetUrl, blockClose) {
            // Remove google analytics and other analytics files that cause stall
            if (/analytics|typepad.*stats|googleads|doubleclick|syndication/i.test(relAssetUrl)) return '';
            // For root-relative links, we need to add the zimitPrefix
            var assetUrl = relAssetUrl.replace(/^\/(?!\/)/, getSiteRoot);
            // For Zimit assets that begin with https: or // the zimitPrefix is derived from the URL
            assetUrl = assetUrl.replace(/^(?:https?:)?\/\//i, getHostRoot);
            // For fully relative links, we have to remove any '..' if we are in root directory
            if (rootDirectory) assetUrl = assetUrl.replace(/^(\.\.\/?)+/, getSiteRoot);
            // Add placeholder to prevent further transformations. For non-anchor URLs, we have to mark potential assets
            // that are not easily identified as assets, due to so many html mimetypes being returned for them
            var isAnchor = /^<a\s/i.test(match);
            var placeholder = isAnchor ? '@kiwixtrans@' : '@kiwixtransformed@';
            var suffix = isAnchor && params.contentInjectionMode === 'serviceworker' ? '?isKiwixHref' : '';
            // The captured URL sits immediately before blockClose at the end of the match. Splicing at that position
            // (rather than replacing the first textual occurrence) ensures that an identical value in an earlier
            // attribute, such as data-src, is not rewritten by mistake
            var urlStart = match.length - blockClose.length - relAssetUrl.length;
            return match.slice(0, urlStart) + placeholder + assetUrl + suffix + blockClose;
        });
        // Deal with image srcsets
        data = data.replace(/<img\b[^>]+srcset=["']([^"']+)/ig, function (match, srcset) {
            var srcsetArr = srcset.split(',');
            for (var i = 0; i < srcsetArr.length; i++) {
                // For root-relative links, we need to add the zimitPrefix
                srcsetArr[i] = srcsetArr[i].replace(/^\s?\/(?!\/)/, getSiteRoot);
                // Zimit prefix is in the URL for absolute URLs
                srcsetArr[i] = srcsetArr[i].replace(/^(?:\s?https?:)?\/\//i, getHostRoot);
                if (rootDirectory) srcsetArr[i] = srcsetArr[i].replace(/^(\.\.\/?)+/, getSiteRoot);
                srcsetArr[i] = '@kiwixtransformed@' + srcsetArr[i];
            }
            // The srcset value is the tail of the match, so it is swapped by position
            return match.slice(0, match.length - srcset.length) + srcsetArr.join(', ');
        });
        // Deal with regex-style urls embedded in page
        data = data.replace(/https?:\\\/\\\/[^"']+/gi, function (assetUrl) {
            assetUrl = assetUrl.replace(/^https?:\\\/\\\//i, '\\/' + namespace + '\\/' + (namespace === 'C' ? 'A\\/' : ''));
            assetUrl = (indexRoot).replace(/\\/g, '\\\\').replace(/\//g, '\\/') + assetUrl;
            return assetUrl;
        });
        // Remove any <base href...> statements
        // DEV: You should probably deal with this more intelligently, changing absolute links rather than just removing,
        // but so far, removing it seems to do the job
        data = data.replace(/<base\b[^>]+href\b[^>]+>\s*/i, '');
        // Remove any residual analytics and ads
        data = data.replace(regexpRemoveAnalytics1, '');
        data = data.replace(regexpRemoveAnalytics2, '');
        // ZIM-specific overrides
        // Deal with YouTube embedded keys: every occurrence of the API key is replaced with the embedded video's ID
        var youTubeKey = data.match(/INNERTUBE_API_KEY['":]+([^'"]+)/);
        if (youTubeKey && youTubeKey[1]) {
            var videoId = data.match(/originalUrl['":]+[^'"]+?youtube.com\/embed\/([^'"]+)/);
            if (videoId && videoId[1]) {
                var rgxYouTubeKey = new RegExp(youTubeKey[1].replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g');
                data = data.replace(rgxYouTubeKey, function () {
                    return videoId[1];
                });
            }
        }
        if (/(?:journals\.openedition\.org)/i.test(params.zimitPrefix)) {
            // DEV: Checked still necessary as of 8-6-2022
            // Neutralize all inline scripts, excluding math blocks or react templates, as they cause a loop on loading article
            data = data.replace(regexpInlineScriptsNotMaths, function (p0, p1) {
                return '<!-- ' + p1 + ' --!>';
            });
        }
        // Remove shopping cart that attempts to post to server or scripts that take a very long time to fail and block page
        if (/passco/i.test(params.zimitPrefix)) {
            data = data.replace(/<script\b[^>]+(?:cart-fragments|lp-global\.min\.js)(?:[^<]|<(?!\/script>))+<\/script>\s*/, '');
        }
    } // End of html transformations

    /**
     * Transform css-style links in stylesheet files and stylesheet blocks in HTML
     */
    if (/\b(css|x?html)\b/i.test(mimetype)) {
        data = data.replace(regexpZimitCssLinks, function (match, url) {
            // For root-relative links, we need to add the zimitPrefix
            var assetUrl = url.replace(/^\/(?!\/)/, getSiteRoot);
            // Deal with absolute URLs
            assetUrl = assetUrl.replace(/^(https?:)?\/\//i, getHostRoot);
            if (rootDirectory) assetUrl = assetUrl.replace(/^(\.\.\/?)+/, getSiteRoot);
            // Relative assets are left exactly as they were
            if (assetUrl === url) return match;
            return match.replace(url, function () {
                return '@kiwixtransformed@' + assetUrl;
            });
        });
    } // End of css transformations

    /**
     * Transform links in JavaScript files or script blocks in the html
     */
    if (/\b(javascript|x?html)\b/i.test(mimetype)) {
        data = data.replace(regexpZimitJavascriptLinks, function (match, url) {
            if (/www\.w3\.org\/XML\//i.test(url)) return match;
            var assetUrl = url.replace(/^\/(?!\/)/, getSiteRoot);
            assetUrl = assetUrl.replace(/^\/\//, getHostRoot);
            assetUrl = assetUrl.replace(/^https?:\/\//i, getHostRoot);
            // Remove analytics
            if (/analytics|(typepad|api).*stats|googleads|doubleclick|syndication|jnn-pa\.googleapis|play\.google\.com/i.test(assetUrl)) assetUrl = '';
            return match.replace(url, function () {
                return '@kiwixtransformed@' + assetUrl;
            });
        });
        // Quoted paths beginning with static/ or api/ (with or without a leading slash) are anchored to the site root
        data = data.replace(/(['"])(?:\/?)((?:static|api)\/)/ig, function (match, openingQuote, path) {
            return openingQuote + getSiteRoot() + path;
        });
    } // End of JavaScript transformations

    // Remove the placeholders used to prevent further matching
    data = data.replace(/@kiwixtransformed@(?!\.)/g, '');
    data = data.replace(/@kiwixtrans(?:formed)?@/g, '');
    return data;
}
/**
 * Transform video URL through fuzzy matching
 * Rules adapted from https://github.com/webrecorder/wabac.js/blob/main/src/fuzzymatcher.js
 * For a YouTube URL with a recognizable video ID, the archive's URL index is first searched for a ptracking header
 * that mentions the ID; the 'ei' and 'cpn' values of that entry are then used to search for the entry holding the video.
 * URLs that are not YouTube URLs, or for which no ID or tracking entry is found, are passed to the callback unchanged
 * @param {String} url The URL to transform through fuzzy matching
 * @param {Document} articleDocument The document whose iframes are updated when an embedded video is resolved
 *     (only its querySelectorAll method is used)
 * @param {Function} callback The function to call with the transformed url
 */
function transformVideoUrl (url, articleDocument, callback) {
    // Only YouTube URLs are subject to fuzzy matching
    if (!/youtu(?:be(?:-nocookie)?\.com|\.be)/i.test(url)) {
        callback(url);
        return;
    }
    // Values taken from URLs must be escaped before they are embedded in a regular expression
    var escapeRegExp = function (str) {
        return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    };
    var cns = appstate.selectedArchive.getContentNamespace();
    // Strip everything that precedes the '<cns>/' path segment
    var rgxTrimUrl = new RegExp('(?:[^/]|\\/(?!' + cns + '\\/))+\\/');
    var pureUrl = url.replace(rgxTrimUrl, '');
    // See https://webapps.stackexchange.com/questions/54443/format-for-id-of-youtube-video for explanation of format
    var videoId = pureUrl.match(/(?:videoid=|watch\?v=|embed\/|\/)([a-zA-Z0-9_-]{10}[048AEIMQUYcgkosw])(?:[&?#%]|\s*$)/i);
    videoId = videoId ? videoId[1] : null;
    if (!videoId) {
        callback(url);
        return;
    }
    var prefix = (cns === 'C' ? cns + '/' : '') + 'H/www.youtube.com/ptracking';
    // Set up regular expression search of URL index (aka fuzzy search)
    var search = {
        rgxPrefix: new RegExp('.*' + videoId, 'i'),
        searchUrlIndex: true,
        size: 1
    };
    appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(prefix, search, function (dirEntry) {
        if (!(dirEntry && dirEntry[0] && dirEntry[0].url)) {
            // No tracking entry, so the video is presumably not in the archive: hand back the original URL
            callback(url);
            if (/youtube\.com\/embed\//i.test(pureUrl)) {
                var anchor = {
                    protocol: 'https:',
                    href: 'https://www.youtube.com/watch?v=' + videoId,
                    type: 'video'
                };
                uiUtil.warnAndOpenExternalLinkInNewTab(null, anchor, 'This video is not available offline in this ZIM. To view online, please open the following URL');
            }
            return;
        }
        dirEntry = dirEntry[0];
        var cpn = dirEntry.url.match(/cpn=([^&]+)/i);
        cpn = cpn ? cpn[1] : null;
        var ei = dirEntry.url.match(/ei=([^&]+)/i);
        ei = ei ? ei[1] : null;
        if (!(cpn || ei)) {
            callback(url);
            return;
        }
        var streamPrefix = (cns === 'C' ? cns + '/' : '') + 'A/rr';
        var streamSearch = {
            rgxPrefix: new RegExp('.*' + (ei ? 'ei=' + escapeRegExp(ei) : '') + (cpn ? '.*cpn=' + escapeRegExp(cpn) : ''), 'i'),
            searchUrlIndex: true,
            size: 1
        };
        // NOTE(review): if this lookup yields no entry, callback is never invoked - confirm whether callers rely on that
        appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(streamPrefix, streamSearch, function (dirEntry) {
            // The found flag ensures that only the first usable result is acted upon
            if (dirEntry && dirEntry[0] && dirEntry[0].url && !streamSearch.found) {
                dirEntry = dirEntry[0];
                streamSearch.found = true;
                // A function replacer keeps any '$' in the entry's URL literal
                var transUrl = url.replace(pureUrl, function () {
                    return dirEntry.namespace + '/' + dirEntry.url;
                });
                console.debug('TRANSFORMED VIDEO URL ' + pureUrl + ' --> \n' + transUrl);
                // If we are dealing with embedded video, we have to find the embedded URL and substitute it
                if (/\/embed\//i.test(pureUrl)) {
                    var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name);
                    Array.prototype.slice.call(articleDocument.querySelectorAll('iframe')).forEach(function (frame) {
                        if (frame.src.indexOf(videoId) !== -1) {
                            var newUrl = window.location.origin + indexRoot + transUrl.replace(/videoembed/, '');
                            frame.src = newUrl;
                        }
                    });
                }
                callback(transUrl);
            }
        }, null); // null prevents callbacks with incomplete results
    }, null);
}
// Public API of this module: each function is exported under its own name
export default {
    filterReplayFiles,
    getZimitRedirect,
    transformReplayUrls,
    transformVideoUrl
};