/**
* transformZimit.js: Functions to enable reading of Zimit ZIM format.
*
* Copyright 2022 Jaifroid, Mossroy and contributors.
* Licence: GPL v3.
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <https://www.gnu.org/licenses/>.
*/
/**
* transformZimit.js: Functions to enable reading of Zimit ZIM format.
*
* Copyright 2022 Jaifroid, Mossroy and contributors.
* Licence: GPL v3.
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <https://www.gnu.org/licenses/>.
*/
'use strict';
import uiUtil from './uiUtil.js';
/* global appstate, params */
/**
 * Flags directory entries that belong to the Replay system (these cannot be loaded alongside a
 * Service Worker without error) and entries that must be inspected to discover the underlying
 * Zimit landing page.
 *
 * The entry is mutated in place:
 * - `inspect = true` is set for entries in the H namespace, for H-prefixed URLs in the C namespace,
 *   and for `index.html` (optionally prefixed with `A/`) when `params.isLandingPage` is truthy;
 * - `nullify = true` is set when the URL ends (or is followed by `?` / `#`) with one of the known
 *   Replay, analytics or tracking resources.
 *
 * @param {dirEntry} dirEntry The directory entry to modify or annul
 * @returns {dirEntry} The same (possibly flagged) directory entry, or null if it is missing or has no url
 */
function filterReplayFiles (dirEntry) {
    // Without an entry, or without a URL on it, there is nothing to examine
    if (!dirEntry || !dirEntry.url) return null;

    var url = dirEntry.url;

    // Type 1 ZIMs keep headers under an 'H/' URL prefix inside the C namespace instead of an H namespace
    var isHeaderEntry = dirEntry.namespace === 'H' ||
        (dirEntry.namespace === 'C' && /^H\//.test(url));
    // The landing-page check is only evaluated when the entry is not a header (short-circuit preserved)
    var needsInspection = isHeaderEntry ||
        (params.isLandingPage && /^(A\/)?index\.html(?:[?#]|$)/.test(url));
    if (needsInspection) {
        dirEntry.inspect = true;
    }

    // Resources that must not be served: each alternative has to be followed by '?', '#' or end of URL
    var unwantedResource = /(?:\bload\.js|\bsw\.js|analytics.*\.js|update\.googleapis|play\.google.*(?:stats|logs)|youtube\.com.*\/stats|google\.internal|syndication|survey\.js|yuiloader\.js|doubleclick|play\.google\.|developer\.mozilla\.org\/static\/js\/main\..+\.js|webpushr.*\.com.*\.js)(?:[?#]|$)/i;
    if (unwantedResource.test(url)) {
        dirEntry.nullify = true;
    }

    return dirEntry;
}
/**
* Inspects the HTML of the ZIM archive's landing page or of the requested Header to discover the URL to redirect to
* adds a custom redirect to the dirEntry
* @param {dirEntry} dirEntry The directory entry of the landing page or H-prefixed header to process
* @param {String} data The decoded data which the dirEntry points to
* @param {String} cns The Content Name Space of the ZIM (usually 'C' or 'A')
* @returns {dirEntry} The modified directory entry
*/
function getZimitRedirect (dirEntry, data, cns) {
var redirect;
// Type 1 ZIMs don't use the H namespace, and instead use H as prefix to the URL
if (dirEntry.namespace === 'H' || cns === 'C' && /^H\//.test(dirEntry.url)) {
// We are dealing with a Header redirect, so we need to find the Location: field
redirect = data.match(/^Location:\s*https?:\/\/([^/]+)(.*)$/m);
if (!redirect) redirect = data.match(/^WARC-Target-URI:\s*https?:\/\/([^/]+)(.*)$/m)
if (redirect && redirect[1]) {
// Type 1 Zimit ZIMs need intermediary 'A' prefix, since there is no longer any A namespace
params.zimitPrefix = (cns === 'C' ? 'A/' : '') + redirect[1];
dirEntry.zimitRedirect = cns + '/' + params.zimitPrefix + redirect[2];
} else {
dirEntry.zimitRedirect = null;
}
} else if (/301\s*moved\s+permanently/i.test(data)) {
redirect = data.match(/moved\s+permanently(?:[^<]|<(?!a\s))+]*?[\s;])(?:src\b|href|url)\s*(=\s*(["']))(?=[./]+|https?)((?:[^>](?!\3|\?|#))+[^>])([^>]*>)/ig;
var regexpZimitJavascriptLinks = /['"(]((?=[^'"?#)]+\.(?:com?\b|net\b|org\b))|(?:(?:https?:)?\/\/)[^'"?#)]*)['"?#)]/ig;
var regexpZimitCssLinks = /\burl\s*\(['"\s]*([^)'"\s]+)['"\s]*\)/ig;
var regexpGetZimitPrefix = /link\s+rel=["']canonical["']\s+href="https?:\/\/([^/"]+)/i;
var regexpRemoveAnalytics1 = /\n');
// Get stem for constructing an absolute URL
data = data.replace(regexpZimitHtmlLinks, function (match, blockStart, equals, quote, relAssetUrl, blockClose) {
var newBlock = match;
var assetUrl = relAssetUrl;
// console.log('Asset URL: ' + assetUrl);
// Remove google analytics and other analytics files that cause stall
if (/analytics|typepad.*stats|googleads|doubleclick|syndication/i.test(assetUrl)) return '';
// For root-relative links, we need to add the zimitPrefix
assetUrl = assetUrl.replace(/^\/(?!\/)/, indexRoot + '/' + dirEntry.namespace + '/' + params.zimitPrefix + '/');
// For Zimit assets that begin with https: or // the zimitPrefix is derived from the URL
assetUrl = assetUrl.replace(/^(?:https?:)?\/\//i, indexRoot + '/' + dirEntry.namespace + '/' + (dirEntry.namespace === 'C' ? 'A/' : ''));
// For fully relative links, we have to remove any '..' if we are in root directory
if (rootDirectory) assetUrl = assetUrl.replace(/^(\.\.\/?)+/, indexRoot + '/' + dirEntry.namespace + '/' + params.zimitPrefix + '/');
// Add placeholder to prevent further transformations
if (/^ ' + newBlock);
return newBlock;
});
// Deal with image srcsets
data = data.replace(/
]+srcset=["']([^"']+)/ig, function (match, srcset) {
var srcsetArr = srcset.split(',');
var swPrefix = params.contentInjectionMode === 'serviceworker' ? indexRoot + '/' : '';
for (var i = 0; i < srcsetArr.length; i++) {
// For root-relative links, we need to add the zimitPrefix
srcsetArr[i] = srcsetArr[i].replace(/^\s*\/(?!\/)/, swPrefix + dirEntry.namespace + '/' + params.zimitPrefix + '/');
// Zimit prefix is in the URL for absolute URLs
srcsetArr[i] = srcsetArr[i].replace(/^(?:\s*https?:)?\/\//i, swPrefix + dirEntry.namespace + '/' + (dirEntry.namespace === 'C' ? 'A/' : ''));
if (rootDirectory) srcsetArr[i] = srcsetArr[i].replace(/^(\.\.\/?)+/, swPrefix + dirEntry.namespace + '/' + params.zimitPrefix + '/');
srcsetArr[i] = '@kiwixtransformed@' + srcsetArr[i];
}
match = match.replace(srcset, srcsetArr.join(', '));
return match;
});
// Deal with regex-style urls embedded in page
data = data.replace(/https?:\\\/\\\/[^"']+/gi, function (assetUrl) {
assetUrl = assetUrl.replace(/^https?:\\\/\\\//i, '\\/' + dirEntry.namespace + '\\/' + (dirEntry.namespace === 'C' ? 'A\\/' : ''));
assetUrl = (indexRoot).replace(/\\/g, '\\\\').replace(/\//g, '\\/') + assetUrl;
return assetUrl;
});
// Remove any <base href...> statements
// DEV: You should probably deal with this more intelligently, changing absolute links rather than just removing,
// but so far, removing it seems to do the job
data = data.replace(/]+href\b[^>]+>\s*/i, '');
// Remove wordpress link tracker
data = data.replace(/