// kiwix-js-pwa/www/js/lib/transformZimit.js

/**
 * transformZimit.js: Functions to enable reading of Zimit ZIM format.
 *
 * Copyright 2022 Jaifroid, Mossroy and contributors.
 * Licence: GPL v3.
 *
 * This file is part of Kiwix.
 *
 * Kiwix is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public Licence as published by
 * the Free Software Foundation, either version 3 of the Licence, or
 * (at your option) any later version.
 *
 * Kiwix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public Licence for more details.
 *
 * You should have received a copy of the GNU General Public Licence
 * along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>.
 */
/**
 * transformZimit.js: Functions to enable reading of Zimit ZIM format.
 *
 * Copyright 2022 Jaifroid, Mossroy and contributors.
 * Licence: GPL v3.
 *
 * This file is part of Kiwix.
 *
 * Kiwix is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public Licence as published by
 * the Free Software Foundation, either version 3 of the Licence, or
 * (at your option) any later version.
 *
 * Kiwix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public Licence for more details.
 *
 * You should have received a copy of the GNU General Public Licence
 * along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>.
 */
'use strict';

import uiUtil from './uiUtil.js';

/* global appstate, params */

/**
 * Screens a directory entry for Replay system files (these cannot be loaded alongside a Service Worker without error).
 * Two flags may be set on the entry, which is modified in place:
 *   - 'inspect' when the entry is in the H namespace, is an H/-prefixed entry in the C namespace, or is the index.html
 *     landing page while params.isLandingPage is set, so that the underlying Zimit landing page can be discovered later
 *   - 'nullify' when the URL matches one of the known Replay, analytics or advertising scripts
 * @param {dirEntry} dirEntry The directory entry to flag
 * @returns {dirEntry} The same directory entry, or null if no directory entry with a URL was supplied
 */
function filterReplayFiles (dirEntry) {
    var entryUrl = dirEntry ? dirEntry.url : null;
    if (!entryUrl) return null;

    // Header entries live in the H namespace, or under an H/ prefix inside the C namespace
    var isHeaderEntry = dirEntry.namespace === 'H' || (dirEntry.namespace === 'C' && /^H\//.test(entryUrl));
    if (isHeaderEntry || (params.isLandingPage && /^(A\/)?index\.html(?:[?#]|$)/.test(entryUrl))) {
        dirEntry.inspect = true;
    }

    var rgxBlockedFiles = /(?:\bload\.js|\bsw\.js|analytics.*\.js|update\.googleapis|play\.google.*(?:stats|logs)|youtube\.com.*\/stats|google\.internal|syndication|survey\.js|yuiloader\.js|doubleclick|play\.google\.|developer\.mozilla\.org\/static\/js\/main\..+\.js)(?:[?#]|$)/i;
    if (rgxBlockedFiles.test(entryUrl)) {
        dirEntry.nullify = true;
    }

    return dirEntry;
}

/**
 * Inspects the HTML of the ZIM archive's landing page, or the text of a requested Header entry, to discover the URL
 * that should be redirected to, and records it in dirEntry.zimitRedirect.
 * Side effects: params.zimitPrefix is updated when a header or landing-page redirect is found, and
 * params.zimitStartPage is set (or reset to null) when a landing page is processed.
 * @param {dirEntry} dirEntry The directory entry of the landing page or H-prefixed header to process
 * @param {String} data The decoded data which the dirEntry points to
 * @param {String} cns The Content Name Space of the ZIM (usually 'C' or 'A')
 * @returns {dirEntry} The modified directory entry
 */
function getZimitRedirect (dirEntry, data, cns) {
    var redirect;
    // Type 1 ZIMs don't use the H namespace, and instead use H as prefix to the URL
    if (dirEntry.namespace === 'H' || cns === 'C' && /^H\//.test(dirEntry.url)) {
        // We are dealing with a Header redirect, so we need to find the Location: field.
        // The host is matched with [^/\s]+ (not [^/]+) so that a target with no path, e.g. 'https://example.com',
        // cannot run on past the end of its line and swallow the headers that follow it
        redirect = data.match(/^Location:\s*https?:\/\/([^/\s]+)(.*)$/m);
        if (!redirect) redirect = data.match(/^WARC-Target-URI:\s*https?:\/\/([^/\s]+)(.*)$/m);
        if (redirect && redirect[1]) {
            // Type 1 Zimit ZIMs need intermediary 'A' prefix, since there is no longer any A namespace
            params.zimitPrefix = (cns === 'C' ? 'A/' : '') + redirect[1];
            dirEntry.zimitRedirect = cns + '/' + params.zimitPrefix + redirect[2];
        } else {
            dirEntry.zimitRedirect = null;
        }
    } else if (/301\s*moved\s+permanently/i.test(data)) {
        // The page is an HTML "moved permanently" notice: take the target from the first anchor that follows the notice
        redirect = data.match(/moved\s+permanently(?:[^<]|<(?!a\s))+<a\s[^"']+["'](?:https?:)?\/?\/?([^"']+)/i);
        if (redirect && redirect[1]) {
            // Remove any port
            var zimitRedirect = redirect[1].replace(/^([^/]+):\d+(\/)/, '$1$2');
            dirEntry.zimitRedirect = cns + '/' + (cns === 'C' ? 'A/' : '') + zimitRedirect;
            // Only report a redirect when one was actually found
            console.debug('*** Asset moved permanently! Redirecting to: ' + dirEntry.zimitRedirect + ' ***');
        }
    } else {
        // Landing page: the start page is announced by the script as window.mainUrl = '...'
        redirect = data.match(/window\.mainUrl\s*=\s*(['"])https?:\/\/([^/]+)(.+?)\1/);
        if (redirect && redirect[2] && redirect[3]) {
            // Logic added for Type 1 Zimit ZIMs
            params.zimitPrefix = (dirEntry.namespace === 'C' ? 'A/' : '') + redirect[2];
            params.zimitStartPage = dirEntry.namespace + '/' + params.zimitPrefix + redirect[3];
        } else {
            params.zimitStartPage = null;
        }
        dirEntry.zimitRedirect = params.zimitStartPage;
    }
    return dirEntry;
}

/**
 * Establish some Regular Expressions used by the transformReplayUrls function
 */
// Matches a src, href or url attribute inside an <a>, <img>, <script>, <link>, <track>, <meta> or <iframe> tag when the
// attribute value starts with '.', '/' or http(s). Capture groups: (1) the tag up to the attribute name, (2) the '=' and
// opening quote, (3) the quote character, (4) the URL up to (but excluding) the closing quote or the first '?' or '#',
// (5) the remainder of the tag including the closing '>'
var regexpZimitHtmlLinks = /(<(?:a|img|script|link|track|meta|iframe)\b[^>]*?[\s;])(?:src\b|href|url)\s*(=\s*(["']))(?=[./]+|https?)((?:[^>](?!\3|\?|#))+[^>])([^>]*>)/ig;
// Matches a URL beginning with http://, https:// or // that directly follows a quote or '(' and runs up to the next
// quote, '?', '#' or ')'; the URL is captured in group 1.
// NOTE(review): the first alternative inside group 1 is a bare lookahead (for .com/.co/.net/.org) with nothing consuming
// the text after it, so it looks unable to produce a match on its own - confirm whether that is intended
var regexpZimitJavascriptLinks = /['"(]((?=[^'"?#)]+\.(?:com?\b|net\b|org\b))|(?:(?:https?:)?\/\/)[^'"?#)]*)['"?#)]/ig;
// Matches CSS url(...) references; group 1 is the URL without any surrounding quotes or whitespace
var regexpZimitCssLinks = /\burl\s*\(['"\s]*([^)'"\s]+)['"\s]*\)/ig;
// Captures (group 1) the host name from a canonical link of the form: link rel="canonical" href="http(s)://host..."
var regexpGetZimitPrefix = /link\s+rel=["']canonical["']\s+href="https?:\/\/([^/"]+)/i;
// Matches whole <script>...</script> elements (and trailing whitespace) that mention Google analytics, adsbygoogle,
// googleads, doubleclick, pubads or syndication
var regexpRemoveAnalytics1 = /<script\b([^<]|<(?!\/script>))+?(?:google.*?analytics|adsbygoogle|googleads|doubleclick|pubads|syndication)([^<]|<(?!\/script>))+<\/script>\s*/ig;
// Matches whole <ins>...</ins> elements (and trailing whitespace) that mention adsbygoogle
var regexpRemoveAnalytics2 = /<ins\b(?:[^<]|<(?!\/ins>))+?adsbygoogle(?:[^<]|<(?!\/ins>))+<\/ins>\s*/ig;
// Matches <script>...</script> elements unless the opening tag carries a type attribute whose value starts with 'math/'
// or 'text/html', or contains 'math'. Group 1 is the element text without its first '<' and its final '>'
var regexpInlineScriptsNotMaths = /<(script\b(?![^>]+type\s*=\s*["'](?:math\/|text\/html|[^"']*?math))(?:[^<]|<(?!\/script>))+<\/script)>/ig;

/**
 * Rewrites the start of a single URL so that it resolves inside the currently loaded archive.
 * The replacement text is supplied through replacer functions so that any '$' it contains (for example in the archive
 * file name) is inserted literally instead of being interpreted as a String.prototype.replace substitution pattern.
 * @param {String} url The URL as found in the document
 * @param {String} sitePrefix Replacement for the leading '/' of a root-relative URL (and for a leading run of '../')
 * @param {String} archivePrefix Replacement for a leading 'http://', 'https://' or '//'
 * @param {Boolean} stripParentRefs Whether a leading run of '../' should also be replaced with sitePrefix
 * @returns {String} The rewritten URL, or the unchanged URL if none of the patterns applied
 */
function rewriteZimitUrl (url, sitePrefix, archivePrefix, stripParentRefs) {
    var insertSitePrefix = function () { return sitePrefix; };
    // For root-relative links, we need to add the zimitPrefix
    var rewritten = url.replace(/^\/(?!\/)/, insertSitePrefix);
    // For URLs that begin with http(s): or // the host is already part of the URL, so only the archive root is added
    rewritten = rewritten.replace(/^(?:https?:)?\/\//i, function () { return archivePrefix; });
    // For fully relative links, we have to remove any '..' if we are in root directory
    if (stripParentRefs) rewritten = rewritten.replace(/^(\.\.\/?)+/, insertSitePrefix);
    return rewritten;
}

/**
 * The main function for transforming Zimit URLs into standard ZIM URLs.
 * HTML, CSS and JavaScript content each get their own set of rewrites; HTML content additionally passes through the
 * CSS and JavaScript rewrites so that inline style and script blocks are covered.
 * Side effect: for HTML content containing a canonical link, params.zimitPrefix is updated from that link.
 * @param {dirEntry} dirEntry The directory entry that points to the extracted data
 * @param {String} data The decompressed and extracted textual data that the dirEntry points to
 * @param {String} mimetype The reported mimetype of the data (this is also in the dirEntry)
 * @returns {String} The transformed data string
 */
function transformReplayUrls (dirEntry, data, mimetype) {
    var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name);
    var namespace = dirEntry.namespace;
    // Type 1 Zimit ZIMs (C namespace) need an intermediary 'A/' path component
    var intermediary = namespace === 'C' ? 'A/' : '';
    // Prefix for URLs that already carry their own host
    var archivePrefix = indexRoot + '/' + namespace + '/' + intermediary;
    /**
     * Note that some Zimit ZIMs have mimetypes like 'text/html;raw=true', so we can't simply match 'text/html'
     * Other ZIMs have a mimetype like 'html' (with no 'text/'), so we have to match as generically as possible
     */
    var isHtml = /\bx?html\b/i.test(mimetype);
    var rootDirectory = false;
    if (isHtml) {
        var canonical = data.match(regexpGetZimitPrefix);
        if (canonical) params.zimitPrefix = intermediary + canonical[1];
        // If the URL is the same as the URL with everything after the first / removed, then we are in the root directory
        // We use this to decide whether to remove any relative link prefixes like ../
        rootDirectory = dirEntry.url === dirEntry.url.replace(/^((?:A\/)?[^/]+\/?).*/, '$1');
    }
    // Prefix for URLs that are relative to the root of the archived site (computed after any prefix update above)
    var sitePrefix = indexRoot + '/' + namespace + '/' + params.zimitPrefix + '/';

    /**
     * Transform URL links in HTML files
     */
    if (isHtml) {
        // Remove lazyimage system and noscript tags that comment out images
        // DEV: Check if this is still necessary
        data = data.replace(/<noscript>\s*(<img\b[^>]+>)\s*<\/noscript>/ig, '$1');
        data = data.replace(/<span\b[^>]+lazy-image-placeholder[^<]+<\/span>\s*/ig, '');
        // Remove meta http-equiv refresh from assets
        if (dirEntry.isAsset) data = data.replace(/<meta\s+http-equiv[^>]+refresh\b[^>]+>\s*/i, '');

        data = data.replace(regexpZimitHtmlLinks, function (match, blockStart, equals, quote, relAssetUrl, blockClose) {
            // Remove google analytics and other analytics files that cause stall
            if (/analytics|typepad.*stats|googleads|doubleclick|syndication/i.test(relAssetUrl)) return '';
            var assetUrl = rewriteZimitUrl(relAssetUrl, sitePrefix, archivePrefix, rootDirectory);
            // The URL and the rest of the tag are the last two things matched, so the text before the URL can be sliced
            // off exactly. This avoids rewriting an identical string that happens to occur earlier in the same tag
            var beforeUrl = match.slice(0, match.length - blockClose.length - relAssetUrl.length);
            // Add placeholder to prevent further transformations
            if (/^<a\s/i.test(match)) {
                return beforeUrl + '@kiwixtrans@' + assetUrl + (params.contentInjectionMode === 'serviceworker' ? '?isKiwixHref' : '') + blockClose;
            }
            // But for non-anchor URLs, we have to mark potential assets that are not easily identified as assets,
            // due to so many html mimetypes being returned for them
            return beforeUrl + '@kiwixtransformed@' + assetUrl + blockClose;
        });

        // Deal with image srcsets
        data = data.replace(/<img\b[^>]+srcset=["']([^"']+)/ig, function (match, srcset) {
            var candidates = srcset.split(',').map(function (candidate) {
                // Candidates after the first normally start with the space that followed the comma: strip it, or the
                // anchored patterns in rewriteZimitUrl would never see the start of the URL
                return '@kiwixtransformed@' + rewriteZimitUrl(candidate.replace(/^\s+/, ''), sitePrefix, archivePrefix, rootDirectory);
            });
            // The srcset value is the tail of the match, so rebuild the tag from the text that precedes it
            return match.slice(0, match.length - srcset.length) + candidates.join(', ');
        });

        // Deal with regex-style urls embedded in page (i.e. with slashes escaped as \/)
        var escapedIndexRoot = indexRoot.replace(/\\/g, '\\\\').replace(/\//g, '\\/');
        var escapedNamespacePath = '\\/' + namespace + '\\/' + (namespace === 'C' ? 'A\\/' : '');
        data = data.replace(/https?:\\\/\\\/[^"']+/gi, function (assetUrl) {
            return escapedIndexRoot + assetUrl.replace(/^https?:\\\/\\\//i, function () { return escapedNamespacePath; });
        });

        // Remove any <base href...> statements
        // DEV: You should probably deal with this more intelligently, changing absolute links rather than just removing,
        // but so far, removing it seems to do the job
        data = data.replace(/<base\b[^>]+href\b[^>]+>\s*/i, '');

        // Remove any residual analytics and ads
        data = data.replace(regexpRemoveAnalytics1, '');
        data = data.replace(regexpRemoveAnalytics2, '');

        // ZIM-specific overrides
        // Deal with YouTube embedded keys: every occurrence of the key is replaced with the embedded video's ID
        var youTubeKey = data.match(/INNERTUBE_API_KEY['":]+([^'"]+)/);
        if (youTubeKey && youTubeKey[1]) {
            var videoId = data.match(/originalUrl['":]+[^'"]+?youtube.com\/embed\/([^'"]+)/);
            if (videoId && videoId[1]) {
                data = data.split(youTubeKey[1]).join(videoId[1]);
            }
        }

        if (/(?:journals\.openedition\.org)/i.test(params.zimitPrefix)) {
            // DEV: Checked still necessary as of 8-6-2022
            // Neutralize all inline scripts, excluding math blocks or react templates, as they cause a loop on loading article
            data = data.replace(regexpInlineScriptsNotMaths, function (p0, p1) {
                return '<!-- ' + p1 + ' --!>';
            });
        }

        // Remove shopping cart that attempts to post to server or scripts that take a very long time to fail and block page
        if (/passco/i.test(params.zimitPrefix)) {
            data = data.replace(/<script\b[^>]+(?:cart-fragments|lp-global\.min\.js)(?:[^<]|<(?!\/script>))+<\/script>\s*/, '');
        }
    } // End of html transformations

    /**
     * Transform css-style links in stylesheet files and stylesheet blocks in HTML
     */
    if (/\b(css|x?html)\b/i.test(mimetype)) {
        data = data.replace(regexpZimitCssLinks, function (match, url) {
            var assetUrl = rewriteZimitUrl(url, sitePrefix, archivePrefix, rootDirectory);
            // Relative assets are left exactly as they are
            if (assetUrl === url) return match;
            return match.replace(url, function () { return '@kiwixtransformed@' + assetUrl; });
        });
    } // End of css transformations

    /**
     * Transform links in JavaScript files or script blocks in the html
     */
    if (/\b(javascript|x?html)\b/i.test(mimetype)) {
        data = data.replace(regexpZimitJavascriptLinks, function (match, url) {
            if (/www\.w3\.org\/XML\//i.test(url)) return match;
            var assetUrl = rewriteZimitUrl(url, sitePrefix, archivePrefix, false);
            // Remove analytics
            if (/analytics|(typepad|api).*stats|googleads|doubleclick|syndication|jnn-pa\.googleapis|play\.google\.com/i.test(assetUrl)) assetUrl = '';
            return match.replace(url, function () { return '@kiwixtransformed@' + assetUrl; });
        });
        // Point quoted paths that start with static/ or api/ (with or without a leading slash) at the archived site
        data = data.replace(/(['"])(?:\/?)((?:static|api)\/)/ig, function (match, quote, segment) {
            return quote + sitePrefix + segment;
        });
    } // End of JavaScript transformations

    // Remove the placeholders used to prevent further matching
    data = data.replace(/@kiwixtransformed@(?!\.)/g, '');
    data = data.replace(/@kiwixtrans(?:formed)?@/g, '');

    return data;
}

/**
 * Transform video URL through fuzzy matching
 * Rules adapted from https://github.com/webrecorder/wabac.js/blob/main/src/fuzzymatcher.js
 * Only YouTube URLs that contain a recognizable video ID are looked up in the archive's URL index; for any other URL
 * the callback receives the URL unchanged.
 * @param {String} url The URL to transform through fuzzy matching
 * @param {Document} articleDocument The article document; its iframes are re-pointed when an embedded video is resolved
 * @param {Function} callback The function to call with the transformed url (or with the original url if no match was made)
 */
function transformVideoUrl (url, articleDocument, callback) {
    if (!/youtu(?:be(?:-nocookie)?\.com|\.be)/i.test(url)) {
        callback(url);
        return;
    }

    var cns = appstate.selectedArchive.getContentNamespace();
    // In the C namespace, entries carry the namespace as an extra leading path component
    var namespacePath = cns === 'C' ? cns + '/' : '';
    // Strip everything that precedes the content namespace in the URL
    var pureUrl = url.replace(new RegExp('(?:[^/]|\\/(?!' + cns + '\\/))+\\/'), '');
    // See https://webapps.stackexchange.com/questions/54443/format-for-id-of-youtube-video for explanation of format
    var idMatch = pureUrl.match(/(?:videoid=|watch\?v=|embed\/|\/)([a-zA-Z0-9_-]{10}[048AEIMQUYcgkosw])(?:[&?#%]|\s*$)/i);
    var videoId = idMatch ? idMatch[1] : null;
    if (!videoId) {
        callback(url);
        return;
    }

    // Regular expression search of URL index (aka fuzzy search). This variable is deliberately shared by both
    // lookups below: the second lookup replaces it, and its 'found' flag guards against repeated results
    var lookup = {
        rgxPrefix: new RegExp('.*' + videoId, 'i'),
        searchUrlIndex: true,
        size: 1
    };

    // Second stage: receives the entry for the video stream itself
    function onStreamLookup (results) {
        var streamEntry = results && results[0];
        if (!(streamEntry && streamEntry.url) || lookup.found) return;
        lookup.found = true;
        var transUrl = url.replace(pureUrl, streamEntry.namespace + '/' + streamEntry.url);
        console.debug('TRANSFORMED VIDEO URL ' + pureUrl + ' --> \n' + transUrl);
        // If we are dealing with embedded video, we have to find the embedded URL and substitute it
        if (/\/embed\//i.test(pureUrl)) {
            var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name);
            var frames = Array.prototype.slice.call(articleDocument.querySelectorAll('iframe'));
            for (var i = 0; i < frames.length; i++) {
                if (frames[i].src.indexOf(videoId) !== -1) {
                    frames[i].src = window.location.origin + indexRoot + transUrl.replace(/videoembed/, '');
                }
            }
        }
        callback(transUrl);
    }

    // First stage: receives the ptracking entry, whose URL carries the cpn / ei values identifying the stream
    function onTrackingLookup (results) {
        var trackingEntry = results && results[0];
        if (!(trackingEntry && trackingEntry.url)) {
            callback(url);
            if (/youtube\.com\/embed\//i.test(pureUrl)) {
                var anchor = {
                    protocol: 'https:',
                    href: 'https://www.youtube.com/watch?v=' + videoId,
                    type: 'video'
                };
                uiUtil.warnAndOpenExternalLinkInNewTab(null, anchor, 'This video is not available offline in this ZIM. To view online, please open the following URL');
            }
            return;
        }
        var cpnMatch = trackingEntry.url.match(/cpn=([^&]+)/i);
        var cpn = cpnMatch ? cpnMatch[1] : null;
        var eiMatch = trackingEntry.url.match(/ei=([^&]+)/i);
        var ei = eiMatch ? eiMatch[1] : null;
        if (!(cpn || ei)) {
            callback(url);
            return;
        }
        lookup = {
            rgxPrefix: new RegExp('.*' + (ei ? 'ei=' + ei : '') + (cpn ? '.*cpn=' + cpn : ''), 'i'),
            searchUrlIndex: true,
            size: 1
        };
        // null prevents callbacks with incomplete results
        appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(namespacePath + 'A/rr', lookup, onStreamLookup, null);
    }

    appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(namespacePath + 'H/www.youtube.com/ptracking', lookup, onTrackingLookup, null);
}

// Public API of the Zimit transformation module
export default {
    filterReplayFiles,
    getZimitRedirect,
    transformReplayUrls,
    transformVideoUrl
};