Backport changes from Kiwix JS

This commit is contained in:
Jaifroid 2023-10-17 13:56:59 +01:00
parent 8d50d1714c
commit d839f29a7d
6 changed files with 129 additions and 114 deletions

View File

@ -1031,7 +1031,7 @@ document.getElementById('btnConfigure').addEventListener('click', function () {
setTab();
if (params.themeChanged) {
params.themeChanged = false;
var archiveName = appstate.selectedArchive ? appstate.selectedArchive._file.name : null;
var archiveName = appstate.selectedArchive ? appstate.selectedArchive.file.name : null;
if (archiveName && ~params.lastPageVisit.indexOf(archiveName)) {
goToArticle(params.lastPageVisit.replace(/@kiwixKey@.+$/, ''));
}
@ -1775,7 +1775,7 @@ document.getElementById('manipulateImagesCheck').addEventListener('click', funct
} else if (window.nw) {
uiUtil.systemAlert('Unfortunately there is currently no way to save an image to disk in the NWJS version of this app.<br>You can do this in the PWA version: please visit https://pwa.kiwix.org.');
} else if (params.contentInjectionMode === 'serviceworker' && appstate.selectedArchive &&
!/wikipedia|wikivoyage|mdwiki|wiktionary/i.test(appstate.selectedArchive._file.name)) {
!/wikipedia|wikivoyage|mdwiki|wiktionary/i.test(appstate.selectedArchive.file.name)) {
uiUtil.systemAlert('Please be aware that Image manipulation can interfere with non-Wikimedia ZIMs (particularly ZIMs that have active content). If you cannot access the articles in such a ZIM, please turn this setting off.');
} else if (/PWA/.test(params.appType) && params.contentInjectionMode === 'jquery') {
uiUtil.systemAlert('Be aware that this option may interfere with active content if you switch to Service Worker mode.');
@ -3381,7 +3381,7 @@ function setLocalArchiveFromArchiveList (archive) {
readNodeDirectoryAndCreateNodeFileObjects(params.pickedFolder, archive)
.then(function (fileset) {
var selectedFiles = fileset[0];
if (appstate.selectedArchive && appstate.selectedArchive._file._files[0].name === selectedFiles[0].name) {
if (appstate.selectedArchive && appstate.selectedArchive.file._files[0].name === selectedFiles[0].name) {
document.getElementById('btnHome').click();
} else {
setLocalArchiveFromFileList(selectedFiles);
@ -3947,18 +3947,18 @@ function archiveReadyCallback (archive) {
// Ensure that the new ZIM output is initially sent to the iframe (e.g. if the last article was loaded in a window)
// (this only affects jQuery mode)
appstate.target = 'iframe';
appstate.wikimediaZimLoaded = /wikipedia|wikivoyage|mdwiki|wiktionary/i.test(archive._file.name);
appstate.wikimediaZimLoaded = /wikipedia|wikivoyage|mdwiki|wiktionary/i.test(archive.file.name);
appstate.pureMode = false;
// These ZIM types have so much dynamic content that we have to allow all images
if (params.imageDisplay && (/gutenberg|phet/i.test(archive._file.name) ||
if (params.imageDisplay && (/gutenberg|phet/i.test(archive.file.name) ||
// params.isLandingPage ||
/kolibri/i.test(archive._file.creator) ||
/kolibri/i.test(archive.creator) ||
params.zimType === 'zimit')) {
params.imageDisplayMode = 'all';
if (params.zimType !== 'zimit') {
// For some archive types (Gutenberg, PhET, Kolibri at least), we have to get out of the way and allow the Service Worker
// to act as a transparent passthrough (this key will be read in the handleMessageChannelMessage function)
console.debug('*** Activating pureMode for ZIM: ' + archive._file.name + ' ***');
console.debug('*** Activating pureMode for ZIM: ' + archive.file.name + ' ***');
appstate.pureMode = true;
}
}
@ -3992,8 +3992,8 @@ function archiveReadyCallback (archive) {
}
}
// The archive is set : go back to home page to start searching
params.storedFile = archive._file._files[0].name;
params.storedFilePath = archive._file._files[0].path ? archive._file._files[0].path : '';
params.storedFile = archive.file._files[0].name;
params.storedFilePath = archive.file._files[0].path ? archive.file._files[0].path : '';
settingsStore.setItem('lastSelectedArchive', params.storedFile, Infinity);
settingsStore.setItem('lastSelectedArchivePath', params.storedFilePath, Infinity);
if (!~params.lastPageVisit.indexOf(params.storedFile.replace(/\.zim(\w\w)?$/, ''))) {
@ -4648,7 +4648,7 @@ function readArticle (dirEntry) {
uiUtil.clearSpinner();
});
} else if (params.contentInjectionMode === 'serviceworker') {
articleContainer = window.open('../' + appstate.selectedArchive._file.name + '/' + dirEntry.namespace + '/' + encodeURIComponent(dirEntry.url),
articleContainer = window.open('../' + appstate.selectedArchive.file.name + '/' + dirEntry.namespace + '/' + encodeURIComponent(dirEntry.url),
params.windowOpener === 'tab' ? '_blank' : encodeURIComponent(dirEntry.title | mimeType),
params.windowOpener === 'window' ? 'toolbar=0,location=0,menubar=0,width=800,height=600,resizable=1,scrollbars=1' : null);
appstate.target = 'window';
@ -4660,7 +4660,7 @@ function readArticle (dirEntry) {
}
// Load cached start page if it exists and we have loaded the packaged file
var htmlContent = 0;
var zimName = appstate.selectedArchive._file.name.replace(/\.[^.]+$/, '').replace(/_\d+-\d+$/, '');
var zimName = appstate.selectedArchive.file.name.replace(/\.[^.]+$/, '').replace(/_\d+-\d+$/, '');
if (params.isLandingPage && params.cachedStartPages[zimName]) {
htmlContent = -1;
// @TODO: Why are we double-encoding here????? Clearly we double-decode somewhere...
@ -4777,7 +4777,7 @@ var loaded = false;
var articleLoadedSW = function (dirEntry) {
if (loaded) return;
loaded = true;
params.lastPageVisit = dirEntry.namespace + '/' + dirEntry.url + '@kiwixKey@' + appstate.selectedArchive._file.name;
params.lastPageVisit = dirEntry.namespace + '/' + dirEntry.url + '@kiwixKey@' + appstate.selectedArchive.file.name;
articleDocument = articleWindow.document.documentElement;
var doc = articleWindow.document;
var docBody = doc.body;
@ -4812,7 +4812,7 @@ var articleLoadedSW = function (dirEntry) {
listenForNavigationKeys();
// We need to keep tabs on the opened tabs or windows if the user wants right-click functionality, and also parse download links
// We need to set a timeout so that dynamically generated URLs are parsed as well (e.g. in Gutenberg ZIMs)
if (params.windowOpener) {
if (params.windowOpener && !appstate.pureMode) {
setTimeout(function () {
parseAnchorsJQuery(dirEntry);
}, 1500);
@ -4973,7 +4973,7 @@ function handleMessageChannelMessage (event) {
} else {
loadingArticle = '';
}
var cacheKey = appstate.selectedArchive._file.name + '/' + title;
var cacheKey = appstate.selectedArchive.file.name + '/' + title;
cache.getItemFromCacheOrZIM(appstate.selectedArchive, cacheKey, dirEntry).then(function (content) {
console.debug('SW read binary file for: ' + dirEntry.namespace + '/' + dirEntry.url);
if (params.zimType === 'zimit' && loadingArticle) {
@ -5051,7 +5051,7 @@ function postTransformedHTML (thisMessage, thisMessagePort, thisDirEntry) {
if (/UWP/.test(params.appType) && (appstate.target === 'window' || appstate.messageChannelWaiting) &&
params.imageDisplay) { thisMessage.imageDisplay = 'all'; }
// We need to do the same for Gutenberg and PHET ZIMs
if (params.imageDisplay && (/gutenberg|phet/i.test(appstate.selectedArchive._file.name)
if (params.imageDisplay && (/gutenberg|phet/i.test(appstate.selectedArchive.file.name)
// || params.isLandingPage
)) {
thisMessage.imageDisplay = 'all';
@ -5203,21 +5203,21 @@ function displayArticleContentInContainer (dirEntry, htmlArticle) {
// Since page has been successfully loaded, store it in the browser history
if (params.contentInjectionMode === 'jquery') pushBrowserHistoryState(dirEntry.namespace + '/' + dirEntry.url);
// Store for fast retrieval
params.lastPageVisit = dirEntry.namespace + '/' + dirEntry.url + '@kiwixKey@' + appstate.selectedArchive._file.name;
params.lastPageVisit = dirEntry.namespace + '/' + dirEntry.url + '@kiwixKey@' + appstate.selectedArchive.file.name;
if (params.rememberLastPage) settingsStore.setItem('lastPageVisit', params.lastPageVisit, Infinity);
cache.setArticle(appstate.selectedArchive._file.name, dirEntry.namespace + '/' + dirEntry.url, htmlArticle, function () {});
cache.setArticle(appstate.selectedArchive.file.name, dirEntry.namespace + '/' + dirEntry.url, htmlArticle, function () {});
params.htmlArticle = htmlArticle;
// Replaces ZIM-style URLs of img, script, link and media tags with a data-kiwixurl to prevent 404 errors [kiwix-js #272 #376]
// This replacement also processes the URL relative to the page's ZIM URL so that we can find the ZIM URL of the asset
// with the correct namespace (this works for old-style -,I,J namespaces and for new-style C namespace)
if (params.linkToWikimediaImageFile && !params.isLandingPage && /(?:wikipedia|wikivoyage|wiktionary|mdwiki)_/i.test(appstate.selectedArchive._file.name)) {
var wikiLang = appstate.selectedArchive._file.name.replace(/(?:wikipedia|wikivoyage|wiktionary|mdwiki)_([^_]+).+/i, '$1');
var wikimediaZimFlavour = appstate.selectedArchive._file.name.replace(/_.+/, '');
if (params.linkToWikimediaImageFile && !params.isLandingPage && /(?:wikipedia|wikivoyage|wiktionary|mdwiki)_/i.test(appstate.selectedArchive.file.name)) {
var wikiLang = appstate.selectedArchive.file.name.replace(/(?:wikipedia|wikivoyage|wiktionary|mdwiki)_([^_]+).+/i, '$1');
var wikimediaZimFlavour = appstate.selectedArchive.file.name.replace(/_.+/, '');
}
var newBlock;
var assetZIMUrlEnc;
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive._file.name) + '/';
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name) + '/';
if (params.contentInjectionMode == 'jquery') {
htmlArticle = htmlArticle.replace(params.regexpTagsWithZimUrl, function (match, blockStart, equals, quote, relAssetUrl, blockClose) {
// Don't process data URIs (yet)
@ -5284,7 +5284,7 @@ function displayArticleContentInContainer (dirEntry, htmlArticle) {
// @TODO - remove when fixed on mw-offliner: dirty patch for removing extraneous tags in ids
htmlArticle = htmlArticle.replace(/(\bid\s*=\s*"[^\s}]+)\s*\}[^"]*/g, '$1');
// @TODO - remove when fixed in MDwiki ZIM: dirty patch for removing erroneously hard-coded style
if (/^mdwiki/.test(appstate.selectedArchive._file.name)) htmlArticle = htmlArticle.replace(/(class=['"]thumbinner[^>]+style=['"]width\s*:\s*)\d+px/ig, '$1320px');
if (/^mdwiki/.test(appstate.selectedArchive.file.name)) htmlArticle = htmlArticle.replace(/(class=['"]thumbinner[^>]+style=['"]width\s*:\s*)\d+px/ig, '$1320px');
// Remove landing page scripts that don't work in SW mode
htmlArticle = htmlArticle.replace(/<script\b[^>]+-\/[^>]*((?:images_loaded|masonry)\.min|article_list_home)\.js"[^<]*<\/script>/gi, '');
// Set max-width for infoboxes (now set in -/s/styles.css)
@ -5399,7 +5399,7 @@ function displayArticleContentInContainer (dirEntry, htmlArticle) {
? false : params.useMathJax;
// Detect raw MathML on page for certain ZIMs that are expected to have it
params.containsMathTexRaw = params.useMathJax &&
/stackexchange|askubuntu|superuser|stackoverflow|mathoverflow|serverfault|stackapps|proofwiki/i.test(appstate.selectedArchive._file.name)
/stackexchange|askubuntu|superuser|stackoverflow|mathoverflow|serverfault|stackapps|proofwiki/i.test(appstate.selectedArchive.file.name)
? /[^\\](\$\$?)((?:\\\$|(?!\1)[\s\S])+)\1/.test(htmlArticle) : false;
// if (params.containsMathTexRaw) {
@ -5572,7 +5572,7 @@ function displayArticleContentInContainer (dirEntry, htmlArticle) {
blobArray.push([title, cssBlobCache.get(title)]);
injectCSS();
} else {
var cacheKey = appstate.selectedArchive._file.name + '/' + title;
var cacheKey = appstate.selectedArchive.file.name + '/' + title;
cache.getItemFromCacheOrZIM(appstate.selectedArchive, cacheKey).then(function (content) {
// DEV: Uncomment line below and break on next to capture cssContent for local filesystem cache
// var cssContent = util.uintToString(content);
@ -5884,7 +5884,7 @@ function displayArticleContentInContainer (dirEntry, htmlArticle) {
// If the request was not initiated by an existing controlled window, we instantiate the request here
if (!appstate.messageChannelWaiting) {
// We put the ZIM filename as a prefix in the URL, so that browser caches are separate for each ZIM file
var newLocation = '../' + appstate.selectedArchive._file.name + '/' + dirEntry.namespace + '/' + encodedUrl;
var newLocation = '../' + appstate.selectedArchive.file.name + '/' + dirEntry.namespace + '/' + encodedUrl;
if (navigator.serviceWorker.controller) {
loaded = false;
articleWindow.location.href = newLocation;
@ -6053,7 +6053,7 @@ function addListenersToLink (a, href, baseUrl) {
e.stopPropagation();
anchorParameter = href.match(/#([^#;]+)$/);
anchorParameter = anchorParameter ? anchorParameter[1] : '';
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive._file.name) + '/';
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name) + '/';
var zimRoot = indexRoot.replace(/^.+?\/www\//, '/');
var zimUrl;
var zimUrlFullEncoding;
@ -6398,7 +6398,7 @@ function goToArticle (path, download, contentType, pathEnc) {
clearFindInArticle();
var shortTitle = path.replace(/[^/]+\//g, '').substring(0, 18);
uiUtil.pollSpinner('Loading ' + shortTitle);
var zimName = appstate.selectedArchive._file.name.replace(/\.[^.]+$/, '').replace(/_\d+-\d+$/, '');
var zimName = appstate.selectedArchive.file.name.replace(/\.[^.]+$/, '').replace(/_\d+-\d+$/, '');
if (~path.indexOf(params.cachedStartPages[zimName])) {
goToMainArticle();
return;
@ -6426,7 +6426,7 @@ function goToArticle (path, download, contentType, pathEnc) {
} else if (download || /\/(epub|pdf|zip|.*opendocument|.*officedocument|tiff|mp4|webm|mpeg|octet-stream)\b/i.test(mimetype)) {
// PDFs can be treated as a special case, as they can be displayed directly in a browser window or tab in most browsers (but not UWP)
if (!/UWP/.test(params.appType) && params.contentInjectionMode === 'serviceworker' && (/\/pdf\b/.test(mimetype) || /\.pdf([?#]|$)/i.test(dirEntry.url))) {
window.open(document.location.pathname.replace(/[^/]+$/, '') + appstate.selectedArchive._file.name + '/' + pathForServiceWorker,
window.open(document.location.pathname.replace(/[^/]+$/, '') + appstate.selectedArchive.file.name + '/' + pathForServiceWorker,
params.windowOpener === 'tab' ? '_blank' : 'Download PDF',
params.windowOpener === 'window' ? 'toolbar=0,location=0,menubar=0,width=800,height=600,resizable=1,scrollbars=1' : null);
} else {
@ -6462,7 +6462,7 @@ function goToRandomArticle () {
// We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
// DEV: If minorVersion is 1, then we are using a v1 article-only title listing. By definition,
// all dirEntries in an article-only listing must be articles.
if (appstate.selectedArchive._file.minorVersion === 1 || /text\/html\b/i.test(dirEntry.getMimetype()) ||
if (appstate.selectedArchive.file.minorVersion === 1 || /text\/html\b/i.test(dirEntry.getMimetype()) ||
params.zimType !== 'zimit' && dirEntry.namespace === 'A') {
params.isLandingPage = false;
alertBoxHeader.style.display = 'none';

View File

@ -105,7 +105,7 @@ function extractImages (images, callback) {
return;
}
// Zimit files (at least) will sometimes have a ZIM prefix, but we are extracting raw here
title = title.replace(appstate.selectedArchive._file.name + '/', '');
title = title.replace(appstate.selectedArchive.file.name + '/', '');
// Zimit files store URLs encoded!
if (params.zimType === 'zimit') title = encodeURI(title);
appstate.selectedArchive.getDirEntryByPath(title).then(function (dirEntry) {
@ -242,7 +242,7 @@ function prepareImagesServiceWorker (win, forPrinting) {
}, 1000);
if (!forPrinting && !documentImages.length) return;
var imageHtml;
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive._file.name) + '/';
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name) + '/';
for (var i = 0, l = documentImages.length; i < l; i++) {
// Process Wikimedia MathML, but not if we'll be using the jQuery routine later
if (!(params.manipulateImages || params.allowHTMLExtraction)) {
@ -312,7 +312,7 @@ function prepareImagesJQuery (win, forPrinting) {
container = win;
var doc = container.document;
var documentImages = doc.querySelectorAll('img[data-kiwixurl], video, audio');
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive._file.name) + '/';
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name) + '/';
indexRoot = indexRoot.replace(/^\//, '');
// Zimit ZIMs work better if all images are extracted
if (params.zimType === 'zimit') forPrinting = true;
@ -331,7 +331,7 @@ function prepareImagesJQuery (win, forPrinting) {
image.style.opacity = '0';
// Set a minimum width to avoid some images not rendering in squashed hidden tables
if (params.displayHiddenBlockElements && image.width && !image.style.minWidth &&
/wiki|wiktionary/i.test(appstate.selectedArchive._file.name)) {
/wiki|wiktionary/i.test(appstate.selectedArchive.file.name)) {
var imgX = image.width + '';
imgX = imgX.replace(/(\d+)$/, '$1px');
image.style.minWidth = imgX;

View File

@ -131,7 +131,7 @@ function transformReplayUrls (dirEntry, data, mimetype, callback) {
* Note that some Zimit ZIMs have mimetypes like 'text/html;raw=true', so we can't simply match 'text/html'
* Other ZIMs have a mimetype like 'html' (with no 'text/'), so we have to match as generically as possible
*/
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive._file.name);
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name);
if (/\bx?html\b/i.test(mimetype)) {
var zimitPrefix = data.match(regexpGetZimitPrefix);
// If the URL is the same as the URL with everything after the first / removed, then we are in the root directory
@ -320,7 +320,7 @@ function transformVideoUrl (url, articleDocument, callback) {
console.debug('TRANSFORMED VIDEO URL ' + pureUrl + ' --> \n' + transUrl);
// If we are dealing with embedded video, we have to find the embedded URL and substitute it
if (/\/embed\//i.test(pureUrl)) {
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive._file.name);
var indexRoot = window.location.pathname.replace(/[^/]+$/, '') + encodeURI(appstate.selectedArchive.file.name);
Array.prototype.slice.call(articleDocument.querySelectorAll('iframe')).forEach(function (frame) {
if (~frame.src.indexOf(videoId)) {
var newUrl = window.location.origin + indexRoot + transUrl.replace(/videoembed/, '');

View File

@ -399,7 +399,7 @@ function displayActiveContentWarning (type) {
(params.contentInjectionMode === 'jquery' ? '<b>Limited Zimit support!</b> Please <a id="swModeLink" href="#contentInjectionModeDiv" ' +
'class="alert-link">switch to Service Worker mode</a> if your platform supports it. '
: 'Support for <b>Zimit</b> archives is experimental. Some content (e.g. audio/video) may fail. ') +
'You can search for content above' + (appstate.selectedArchive._file.fullTextIndex ? ' using full-text search if your app supports it, ' +
'You can search for content above' + (appstate.selectedArchive.file.fullTextIndex ? ' using full-text search if your app supports it, ' +
'or s' : '. S') + 'tart your search with <b>.*</b> to match part of a title. Type a <b><i>space</i></b> for the ZIM Archive Index, or ' +
'<b><i>space / </i></b> for the URL Index.&nbsp;[<a id="stop" href="#expertSettingsDiv" class="alert-link">Permanently hide</a>]' +
'</div>';

View File

@ -1,22 +1,22 @@
/**
* zimArchive.js: Support for archives in ZIM format.
*
* Copyright 2015 Mossroy and contributors
* License GPL v3:
* Copyright 2015-2023 Mossroy, Jaifroid and contributors
* Licence GPL v3:
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public License
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
*/
@ -34,10 +34,17 @@ import utf8 from './utf8.js';
/**
* ZIM Archive
*
*
* @typedef ZIMArchive
* @property {ZIMFile} _file The ZIM file (instance of ZIMFile, that might physically be split into several actual files)
* @property {String} _language Language of the content
* @property {ZIMFile} file The ZIM file (instance of ZIMFile, that might physically be split into several actual _files)
* @property {String} counter Counter of various types of content in the archive
* @property {String} creator Creator of the content
* @property {String} date Date of the creation of the archive
* @property {String} description Description of the content
* @property {String} language Language of the content
* @property {String} name Name of the archive
* @property {String} publisher Publisher of the content
* @property {String} title Title of the content
* @property {String} zimType Extended property: currently either 'open' for OpenZIM file type, or 'zimit' for the warc2zim file type used by Zimit
*/
/**
@ -66,17 +73,16 @@ var LZ;
*/
function ZIMArchive (storage, path, callbackReady, callbackError) {
var that = this;
that._file = null;
that._language = ''; // @TODO
that.file = null;
var createZimfile = function (fileArray) {
zimfile.fromFileArray(fileArray).then(function (file) {
that._file = file;
that.file = file;
// Clear the previous libzimWorker
LZ = null;
// Set a global parameter to report the search provider type
params.searchProvider = 'title';
// File has been created, but we need to add any Listings which extend the archive metadata
that._file.setListings([
that.file.setListings([
// Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file
// 'ptrName' and 'countName' contain the key names to be set in the archive file object
{
@ -100,16 +106,16 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
}
]).then(function () {
// There is currently an exception thrown in the libzim wasm if we attempt to load a split ZIM archive, so we work around
var isSplitZim = /\.zima.$/i.test(that._file._files[0].name);
var isSplitZim = /\.zima.$/i.test(that.file._files[0].name);
var libzimReaderType = params.debugLibzimASM || ('WebAssembly' in self ? 'wasm' : 'asm');
if (that._file.fullTextIndex && params.debugLibzimASM !== 'disable' && (params.debugLibzimASM || !isSplitZim &&
if (that.file.fullTextIndex && params.debugLibzimASM !== 'disable' && (params.debugLibzimASM || !isSplitZim &&
// The ASM implementation requires Atomics support, whereas the WASM implementation does not
(typeof Atomics !== 'undefined' || libzimReaderType === 'wasm') &&
// Note that NWJS currently throws due to problems with Web Worker context, and Android is very slow unless we use OPFS
!(/Android/.test(params.appType) && !params.useOPFS) && !(window.nw && that._file._files[0].readMode === 'electron'))) {
!(/Android/.test(params.appType) && !params.useOPFS) && !(window.nw && that.file._files[0].readMode === 'electron'))) {
console.log('Instantiating libzim ' + libzimReaderType + ' Web Worker...');
LZ = new Worker('js/lib/libzim-' + libzimReaderType + '.js');
that.callLibzimWorker({ action: 'init', files: that._file._files }).then(function (msg) {
that.callLibzimWorker({ action: 'init', files: that.file._files }).then(function (msg) {
// console.debug(msg);
params.searchProvider = 'fulltext: ' + libzimReaderType;
// Update the API panel
@ -120,7 +126,7 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
});
} else {
// var message = 'Full text searching is not available because ';
if (!that._file.fullTextIndex) {
if (!that.file.fullTextIndex) {
params.searchProvider += ': no_fulltext'; // message += 'this ZIM does not have a full-text index.';
} else if (isSplitZim) {
params.searchProvider += ': split_zim'; // message += 'the ZIM archive is split.';
@ -134,24 +140,35 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
params.searchProvider += ': unknown';
}
uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
// uiUtil.systemAlert(message);
}
// Set the archive file type ('open' or 'zimit')
params.zimType = that.setZimType();
// var thisCallbackReady = callbackReady;
// Add any metadata from the M/ namespace that you need access to here
// Add time-critical metadata from the M/ namespace that you need early access to here
// Note that adding metadata here delays the reporting of the ZIM archive as ready
// Further metadata are added in the background below, and can be accessed later
Promise.all([
that.addMetadataToZIMFile('Creator'),
that.addMetadataToZIMFile('Name')
that.addMetadataToZIMFile('Language')
]).then(function () {
// If the arhchive name doesn't end in `.zim`, we add it to the metadata
that._file.name = that._file.name.replace(/\.zim\s*$/i, '') + '.zim';
console.debug('ZIMArchive ready, metadata will be added in the background');
// All listings should be loaded, so we can now call the callback
callbackReady(that);
});
// DEV: Currently, extended listings are only used for title (=article) listings when the user searches
// for an article or uses the Random button, by which time the listings will have been extracted.
// If, in the future, listings are used in a more time-critical manner, consider forcing a wait before
// declaring the archive to be ready, by chaining the following callback in a .then() function of setListings.
// Add non-time-critical metadata to archive in background so as not to delay opening of the archive
// DEV: Note that it does not make sense to extract illustration (icon) metadata here. Instead, if you implement use of the illustration
// metadata as icons for the loaded ZIM [kiwix-js #886], you should simply use the ZIMArchive.getMetadata() function when needed
setTimeout(function () {
Promise.all([
that.addMetadataToZIMFile('Counter'),
that.addMetadataToZIMFile('Date'),
that.addMetadataToZIMFile('Description'),
that.addMetadataToZIMFile('Name'),
that.addMetadataToZIMFile('Publisher'),
that.addMetadataToZIMFile('Title')
]).then(function () {
console.debug('ZIMArchive metadata loaded:', that);
});
}, 1500);
}).catch(function (err) {
console.warn('Error setting archive listings: ', err);
});
@ -208,27 +225,27 @@ ZIMArchive.prototype._searchArchiveParts = function (storage, prefixPath) {
* @returns {Boolean}
*/
ZIMArchive.prototype.isReady = function () {
return this._file !== null;
return this.file !== null;
};
/**
* Detects whether the supplied archive is a Zimit-style archive or an OpenZIM archive and
* sets a _file.zimType property accordingly; also returns the detected type. Extends ZIMFile.
* sets a zimType property accordingly; also returns the detected type. Extends ZIMArchive.
* @returns {String} Either 'zimit' for a Zimit archive, or 'open' for an OpenZIM archive
*/
ZIMArchive.prototype.setZimType = function () {
var fileType = null;
var archiveType = null;
if (this.isReady()) {
fileType = 'open';
this._file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) fileType = 'zimit';
archiveType = 'open';
this.file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) archiveType = 'zimit';
});
this._file.zimType = fileType;
console.debug('Archive type set to: ' + fileType);
this.zimType = archiveType;
console.debug('Archive type set to: ' + archiveType);
} else {
console.error('ZIMArchive is not ready! Cannot set ZIM type.');
}
return fileType;
return archiveType;
};
/**
@ -238,11 +255,11 @@ ZIMArchive.prototype.setZimType = function () {
*/
ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
if (this.isReady()) {
var mainPageUrlIndex = this._file.mainPage;
var mainPageUrlIndex = this.file.mainPage;
var that = this;
this._file.dirEntryByUrlIndex(mainPageUrlIndex).then(function (dirEntry) {
this.file.dirEntryByUrlIndex(mainPageUrlIndex).then(function (dirEntry) {
// Filter out Zimit files that we cannot handle without error
if (that._file.zimType === 'zimit') dirEntry = transformZimit.filterReplayFiles(dirEntry);
if (that.zimType === 'zimit') dirEntry = transformZimit.filterReplayFiles(dirEntry);
callback(dirEntry);
});
}
@ -254,7 +271,7 @@ ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
* @returns {DirEntry}
*/
ZIMArchive.prototype.parseDirEntryId = function (dirEntryId) {
return zimDirEntry.DirEntry.fromStringId(this._file, dirEntryId);
return zimDirEntry.DirEntry.fromStringId(this.file, dirEntryId);
};
/**
@ -318,7 +335,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefix = function (search, callback, noIn
var prefixNameSpaces = '';
if (search.searchUrlIndex) {
var rgxSplitPrefix = /^[-ABCHIJMUVWX]\//;
if (that._file.zimType === 'zimit' && cns === 'C') {
if (that.zimType === 'zimit' && cns === 'C') {
// We have to account for the Zimit prefix in Type 1 ZIMs
rgxSplitPrefix = /^(?:[CMWX]\/)?(?:[AH]\/)?/;
}
@ -425,7 +442,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefix = function (search, callback, noIn
ZIMArchive.prototype.getContentNamespace = function () {
var errorText;
if (this.isReady()) {
var ver = this._file.minorVersion;
var ver = this.file.minorVersion;
// DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
// If this changes, adapt the error checking and return values
if (ver > 1) {
@ -455,11 +472,11 @@ ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, s
prefix = prefix || '';
var cns = this.getContentNamespace();
// Search v1 article listing if available, otherwise fallback to v0
var articleCount = this._file.articleCount || this._file.entryCount;
var searchFunction = appstate.selectedArchive._file.dirEntryByTitleIndex;
var articleCount = this.file.articleCount || this.file.entryCount;
var searchFunction = appstate.selectedArchive.file.dirEntryByTitleIndex;
if (search.searchUrlIndex) {
articleCount = this._file.entryCount;
searchFunction = appstate.selectedArchive._file.dirEntryByUrlIndex;
articleCount = this.file.entryCount;
searchFunction = appstate.selectedArchive.file.dirEntryByUrlIndex;
}
util.binarySearch(startIndex, articleCount, function(i) {
return searchFunction(i).then(function(dirEntry) {
@ -488,7 +505,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, s
});
}, true).then(function (firstIndex) {
var vDirEntries = [];
var addDirEntries = function(index, lastTitle) {
var addDirEntries = function (index, lastTitle) {
if (search.status === 'cancelled' || search.found >= search.size || index >= articleCount
|| lastTitle && !~lastTitle.indexOf(prefix) || index - firstIndex >= search.window) {
// DEV: Diagnostics to be removed before merge
@ -543,7 +560,7 @@ ZIMArchive.prototype.findDirEntriesFromFullTextSearch = function (search, dirEnt
// We give ourselves an overhead in calculating the results needed, because full-text search will return some results already found
// var resultsNeeded = Math.floor(params.maxSearchResultsSize - dirEntries.length / 2);
var resultsNeeded = number || params.maxSearchResultsSize;
return this.callLibzimWorker({action: "search", text: search.prefix, numResults: resultsNeeded}).then(function (results) {
return this.callLibzimWorker({ action: 'search', text: search.prefix, numResults: resultsNeeded }).then(function (results) {
if (results) {
var dirEntryPaths = [];
var fullTextPaths = [];
@ -614,10 +631,10 @@ ZIMArchive.prototype.callLibzimWorker = function (parameters) {
* @param {DirEntry} dirEntry
* @param {callbackDirEntry} callback
*/
ZIMArchive.prototype.resolveRedirect = function(dirEntry, callback) {
ZIMArchive.prototype.resolveRedirect = function (dirEntry, callback) {
var that = this;
this._file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(function (resolvedDirEntry) {
if (that._file.zimType === 'zimit') resolvedDirEntry = transformZimit.filterReplayFiles(resolvedDirEntry);
this.file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(function (resolvedDirEntry) {
if (that.zimType === 'zimit') resolvedDirEntry = transformZimit.filterReplayFiles(resolvedDirEntry);
callback(resolvedDirEntry);
});
};
@ -632,7 +649,7 @@ ZIMArchive.prototype.resolveRedirect = function(dirEntry, callback) {
* @param {DirEntry} dirEntry
* @param {callbackStringContent} callback
*/
ZIMArchive.prototype.readUtf8File = function(dirEntry, callback) {
ZIMArchive.prototype.readUtf8File = function (dirEntry, callback) {
var cns = appstate.selectedArchive.getContentNamespace();
return dirEntry.readData().then(function(data) {
var mimetype = dirEntry.getMimetype();
@ -691,7 +708,7 @@ ZIMArchive.prototype.readUtf8File = function(dirEntry, callback) {
* @param {DirEntry} dirEntry
* @param {callbackBinaryContent} callback
*/
ZIMArchive.prototype.readBinaryFile = function(dirEntry, callback) {
ZIMArchive.prototype.readBinaryFile = function (dirEntry, callback) {
var that = this;
return dirEntry.readData().then(function(data) {
var mimetype = dirEntry.getMimetype();
@ -720,7 +737,7 @@ ZIMArchive.prototype.readBinaryFile = function(dirEntry, callback) {
* @param {String} originalPath Optional string used internally to prevent infinite loop
* @return {Promise<DirEntry>} A Promise that resolves to a Directory Entry, or null if not found.
*/
ZIMArchive.prototype.getDirEntryByPath = function(path, zimitResolving, originalPath) {
ZIMArchive.prototype.getDirEntryByPath = function (path, zimitResolving, originalPath) {
var that = this;
if (originalPath) appstate.originalPath = originalPath;
path = path.replace(/\?kiwix-display/, '');
@ -729,14 +746,14 @@ ZIMArchive.prototype.getDirEntryByPath = function(path, zimitResolving, original
var revisedPath = path.replace(/.*?((?:C\/A|A)\/(?!.*(?:C\/A|A)).+)$/, '$1');
if (revisedPath !== path) {
console.warn('*** Revised path from ' + path + '\nto: ' + revisedPath + ' ***');
if (appstate.selectedArchive._file.zimType === 'zimit') {
if (appstate.selectedArchive.zimType === 'zimit') {
console.debug('*** DEV: Consider correcting this error in tranformZimit.js ***');
}
path = revisedPath;
}
}
return util.binarySearch(0, this._file.entryCount, function(i) {
return that._file.dirEntryByUrlIndex(i).then(function(dirEntry) {
return util.binarySearch(0, this.file.entryCount, function(i) {
return that.file.dirEntryByUrlIndex(i).then(function(dirEntry) {
var url = dirEntry.namespace + "/" + dirEntry.url;
if (path < url) {
return -1;
@ -748,15 +765,15 @@ ZIMArchive.prototype.getDirEntryByPath = function(path, zimitResolving, original
});
}).then(function (index) {
if (index === null) return null;
return that._file.dirEntryByUrlIndex(index);
return that.file.dirEntryByUrlIndex(index);
}).then(function (dirEntry) {
// Filter Zimit dirEntries and do some initial transforms
if (that._file.zimType === 'zimit') {
if (that.zimType === 'zimit') {
dirEntry = transformZimit.filterReplayFiles(dirEntry);
}
if (!dirEntry) {
// We couldn't get the dirEntry, so look it up in the Zimit header
if (!zimitResolving && that._file.zimType === 'zimit' && !/^(H|C\/H)\//.test(path) && path !== appstate.originalPath) {
if (!zimitResolving && that.zimType === 'zimit' && !/^(H|C\/H)\//.test(path) && path !== appstate.originalPath) {
// We need to look the file up in the Header namespace (double replacement ensures both types of ZIM are supported)
var oldPath = path;
path = path.replace(/^A\//, 'H/').replace(/^(C\/)A\//, '$1H/');
@ -831,9 +848,9 @@ function fuzzySearch(path, search) {
*/
ZIMArchive.prototype.getRandomDirEntry = function (callback) {
// Prefer an article-only (v1) title pointer list, if available
var articleCount = this._file.articleCount || this._file.entryCount;
var articleCount = this.file.articleCount || this.file.entryCount;
var index = Math.floor(Math.random() * articleCount);
this._file.dirEntryByTitleIndex(index).then(callback);
this.file.dirEntryByTitleIndex(index).then(callback);
};
/**
@ -869,7 +886,7 @@ ZIMArchive.prototype.addMetadataToZIMFile = function (key) {
return new Promise(function (resolve, reject) {
that.getMetadata(key, function (data) {
data = data || '';
that._file[lcaseKey] = data;
that[lcaseKey] = data;
resolve(data);
});
});
@ -877,4 +894,4 @@ ZIMArchive.prototype.addMetadataToZIMFile = function (key) {
export default {
ZIMArchive: ZIMArchive
};
};

View File

@ -2,24 +2,28 @@
* zimfile.js: Low-level ZIM file reader.
*
* Copyright 2015 Mossroy and contributors
* License GPL v3:
* Licence GPL v3:
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public License
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
*/
'use strict';
/* global params, appstate */
import xz from './xzdec_wrapper.js';
import zstd from './zstddec_wrapper.js';
import util from './util.js';
@ -27,10 +31,6 @@ import utf8 from './utf8.js';
import zimDirEntry from './zimDirEntry.js';
import FileCache from './filecache.js';
/* global params, appstate */
'use strict';
/**
* This code makes an assumption that no Directory Entry will be larger than MAX_SUPPORTED_DIRENTRY_SIZE bytes.
* If a larger dirEntry is encountered, a warning will display in console. Increase this value if necessary.
@ -112,7 +112,6 @@ var readInt = function (data, offset, size) {
* @property {Integer} mimeListPos Position of the MIME type list (also header size)
* @property {Integer} mainPage Main page or 0xffffffff if no main page
* @property {Integer} layoutPage Layout page or 0xffffffff if no layout page
* @property {String} zimType Extended property: currently either 'open' for OpenZIM file type, or 'zimit' for the warc2zim file type used by Zimit (set in zimArchive.js)
* @property {Map} mimeTypes Extended property: the ZIM file's MIME type table rendered as a Map (calculated entry)
*/
@ -230,7 +229,7 @@ ZIMFile.prototype.dirEntry = function (offset) {
* @returns {Promise<DirEntry>} A Promise for the requested DirEntry
*/
ZIMFile.prototype.dirEntryByUrlIndex = function (index) {
var that = appstate.selectedArchive._file;
var that = appstate.selectedArchive.file;
if (!that) return Promise.resolve(null);
return that._readInteger(that.urlPtrPos + index * 8, 8).then(function (dirEntryPos) {
return that.dirEntry(dirEntryPos);
@ -243,7 +242,7 @@ ZIMFile.prototype.dirEntryByUrlIndex = function (index) {
* @returns {Promise<DirEntry>} A Promise for the requested DirEntry
*/
ZIMFile.prototype.dirEntryByTitleIndex = function (index) {
var that = appstate.selectedArchive._file;
var that = appstate.selectedArchive.file;
// Use v1 title pointerlist if available, or fall back to legacy v0 list
var ptrList = that.articlePtrPos || that.titlePtrPos;
return that._readInteger(ptrList + index * 4, 4).then(function (urlIndex) {
@ -333,7 +332,6 @@ ZIMFile.prototype.setListings = function (listings) {
// If we are in a legacy ZIM archive, we need to calculate the true article count (of entries in the A namespace)
// This effectively emulates the v1 article pointerlist
if (this.minorVersion === 0) {
// console.debug('ZIM DirListing version: 0 (legacy)', this);
// Initiate a binary search for the first or last article
var getArticleIndexByOrdinal = function (ordinal) {
return util.binarySearch(0, that.entryCount, function (i) {