/**
* zimArchive.js: Support for archives in ZIM format.
*
* Copyright 2015 Mossroy and contributors
* License GPL v3:
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
*/
'use strict';
/* global params */
import zimfile from './zimfile.js';
import zimDirEntry from './zimDirEntry.js';
import transformZimit from './transformZimit.js';
import util from './util.js';
import uiUtil from './uiUtil.js';
import utf8 from './utf8.js';
/**
* ZIM Archive
*
*
* @typedef ZIMArchive
* @property {ZIMFile} _file The ZIM file (instance of ZIMFile, that might physically be split into several actual files)
* @property {String} _language Language of the content
*/
/**
* @callback callbackZIMArchive
* @param {ZIMArchive} zimArchive Ready-to-use ZIMArchive
*/
/**
* @callback callbackMetadata
* @param {String} data metadata string
*/
/**
* Module-level handle to the Web Worker that runs the libzim (WebAssembly or asm.js) binary used for full-text search.
* It is reset to null and, where possible, re-created by the ZIMArchive constructor each time an archive is loaded,
* and it is the worker that callLibzimWorker() posts messages to.
* @type {Worker}
*/
var LZ;
/**
 * Creates a ZIM archive object to access the ZIM file at the given path in the given storage.
 * This constructor can also be used with a single File parameter.
 *
 * Once the underlying ZIM file object has been created, the constructor:
 *  - terminates and clears any libzim worker left over from a previously loaded archive;
 *  - asks the file to extract the extended listings (title index v0/v1 and the Xapian full-text index);
 *  - starts a libzim Web Worker for full-text search when the archive and the environment allow it, and reports
 *    the resulting search provider in params.searchProvider and in the API status panel;
 *  - sets params.zimType and calls callbackReady (without waiting for the listings to be extracted).
 *
 * @param {StorageFirefoxOS|Array} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted)
 * @param {String} path The Storage path for an OS that requires this to be specified
 * @param {callbackZIMArchive} callbackReady The function to call when the archive is ready to use
 * @param {Function} callbackError The function to call when an error occurs; it receives a message and a short label
 */
function ZIMArchive (storage, path, callbackReady, callbackError) {
    var that = this;
    that._file = null;
    that._language = ''; // @TODO
    var createZimfile = function (fileArray) {
        zimfile.fromFileArray(fileArray).then(function (file) {
            that._file = file;
            // Shut down the libzim worker of any previously loaded archive before dropping our reference to it,
            // otherwise the old worker would keep running with nothing able to reach it
            if (LZ && typeof LZ.terminate === 'function') LZ.terminate();
            LZ = null;
            // Set a global parameter to report the search provider type
            params.searchProvider = 'title';
            // File has been created, but we need to add any Listings which extend the archive metadata
            that._file.setListings([
                // Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file
                // 'ptrName' and 'countName' contain the key names to be set in the archive file object
                {
                    // This defines the standard v0 (legacy) title index that contains listings for every entry in the ZIM (not just articles)
                    // It represents the same index that is referenced in the ZIM archive header
                    path: 'X/listing/titleOrdered/v0',
                    ptrName: 'titlePtrPos',
                    countName: 'entryCount'
                },
                {
                    // This defines a new version 1 index that is present in no-namespace ZIMs, and contains a title-ordered list of articles
                    path: 'X/listing/titleOrdered/v1',
                    ptrName: 'articlePtrPos',
                    countName: 'articleCount'
                },
                {
                    // This tests for and specifies the existence of any Xapian Full Text Index
                    path: 'X/fulltext/xapian',
                    ptrName: 'fullTextIndex',
                    countName: 'fullTextIndexSize'
                }
            ]).then(function () {
                // There is currently an exception thrown in the libzim wasm if we attempt to load a split ZIM archive, so we work around
                var isSplitZim = /\.zima.$/i.test(that._file._files[0].name);
                var libzimReaderType = params.debugLibzimASM || ('WebAssembly' in self ? 'wasm' : 'asm');
                if (that._file.fullTextIndex && params.debugLibzimASM !== 'disable' && (params.debugLibzimASM || !isSplitZim &&
                    // The ASM implementation requires Atomics support, whereas the WASM implementation does not
                    (typeof Atomics !== 'undefined' || libzimReaderType === 'wasm') &&
                    // Note that Android and NWJS currently throw due to problems with Web Worker context
                    !/Android/.test(params.appType) && !(window.nw && that._file._files[0].readMode === 'electron'))) {
                    console.log('Instantiating libzim ' + libzimReaderType + ' Web Worker...');
                    LZ = new Worker('js/lib/libzim-' + libzimReaderType + '.js');
                    that.callLibzimWorker({ action: 'init', files: that._file._files }).then(function (msg) {
                        // console.debug(msg);
                        params.searchProvider = 'fulltext: ' + libzimReaderType;
                        // Update the API panel
                        uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
                    }).catch(function (err) {
                        uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider + ': ERROR');
                        console.error('The libzim worker could not be instantiated!', err);
                    });
                } else {
                    // Record why full-text search is unavailable (first matching reason wins)
                    if (!that._file.fullTextIndex) {
                        params.searchProvider += ': no_fulltext'; // this ZIM does not have a full-text index
                    } else if (isSplitZim) {
                        params.searchProvider += ': split_zim'; // the ZIM archive is split
                    } else if (typeof Atomics === 'undefined') {
                        params.searchProvider += ': no_atomics'; // this browser does not support Atomic operations
                    } else if (/Android/.test(params.appType)) {
                        params.searchProvider += ': no_sharedArrayBuffer';
                    } else if (params.debugLibzimASM === 'disable') {
                        params.searchProvider += ': disabled';
                    } else {
                        params.searchProvider += ': unknown';
                    }
                    uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
                }
            }).catch(function (err) {
                // Listings are an optional extension: log the failure rather than leaving an unhandled rejection
                console.error('There was an error extracting the extended listings or setting up full-text search', err);
            });
            // Set the archive file type ('open' or 'zimit')
            params.zimType = that.setZimType();
            // DEV: Currently, extended listings are only used for title (=article) listings when the user searches
            // for an article or uses the Random button, by which time the listings will have been extracted.
            // If, in the future, listings are used in a more time-critical manner, consider forcing a wait before
            // declaring the archive to be ready, by chaining the following callback in a .then() function of setListings.
            callbackReady(that);
        }, function (error) {
            // Only failures of zimfile.fromFileArray end up here (two-argument then), so an exception thrown by
            // callbackReady is not misreported as a file-reading error
            console.error('The ZIM file object could not be created', error);
            if (typeof callbackError === 'function') {
                callbackError('Error reading ZIM file' + (path ? ' ' + path : '') + ' : ' + error, 'Error reading archive file');
            }
        });
    };
    if (storage && !path) {
        // The constructor has been called with an array (or FileList) of File/Blob objects, so convert it into a true Array
        var fileList = storage;
        var fileArray = [].slice.call(fileList);
        createZimfile(fileArray);
    } else {
        if (/.*zim..$/.test(path)) {
            // Split archive: strip the two-letter part suffix and collect all the parts
            that._searchArchiveParts(storage, path.slice(0, -2)).then(function (fileArray) {
                createZimfile(fileArray);
            }).catch(function (error) {
                callbackError('Error reading files in split archive ' + path + ': ' + error, 'Error reading archive files');
            });
        } else {
            storage.get(path).then(function (file) {
                createZimfile([file]);
            }).catch(function (error) {
                callbackError('Error reading ZIM file ' + path + ' : ' + error, 'Error reading archive file');
            });
        }
    }
}
/**
 * Collects every part of a split archive by requesting consecutive two-letter suffixes ("aa", "ab", ...)
 * from the storage until a request fails.
 * @param {Storage} storage storage interface (its get() method must return a Promise)
 * @param {String} prefixPath path to the split files, missing the "aa" / "ab" / ... suffix.
 * @returns {Promise} that resolves to the array of file objects found (in part order).
 */
ZIMArchive.prototype._searchArchiveParts = function (storage, prefixPath) {
    var LETTER_A = 0x61;
    var parts = [];
    // Converts a zero-based part number into its two-letter suffix
    function suffixFor (partNumber) {
        var first = String.fromCharCode(LETTER_A + Math.floor(partNumber / 26));
        var second = String.fromCharCode(LETTER_A + partNumber % 26);
        return first + second;
    }
    // Requests one part, then recurses to the next; the first failed request ends the collection
    function collectFrom (partNumber) {
        var partPath = prefixPath + suffixFor(partNumber);
        return storage.get(partPath).then(
            function (file) {
                parts.push(file);
                return collectFrom(partNumber + 1);
            },
            function (error) {
                console.error('Error reading split archive file ' + partPath + ': ', error);
                return parts;
            }
        );
    }
    return collectFrom(0);
};
/**
 * Reports whether the underlying ZIM file object has been set on this archive.
 * @returns {Boolean} false while _file is still null, true otherwise
 */
ZIMArchive.prototype.isReady = function () {
    var file = this._file;
    return !(file === null);
};
/**
 * Works out whether the loaded archive is a Zimit-style archive or an OpenZIM archive by looking for a
 * 'warc-headers' entry among the file's MIME types, stores the answer in _file.zimType and returns it.
 * @returns {String} 'zimit' for a Zimit archive, 'open' for an OpenZIM archive, or null if the archive is not ready
 */
ZIMArchive.prototype.setZimType = function () {
    if (!this.isReady()) {
        console.error('ZIMArchive is not ready! Cannot set ZIM type.');
        return null;
    }
    var detectedType = 'open';
    // forEach is used because it is the iteration method the original relied on for the mimeTypes collection
    this._file.mimeTypes.forEach(function (mimeType) {
        if (/warc-headers/i.test(mimeType)) detectedType = 'zimit';
    });
    this._file.zimType = detectedType;
    console.debug('Archive type set to: ' + detectedType);
    return detectedType;
};
/**
 * Looks for the DirEntry of the main page.
 * In Zimit archives the entry is first passed through transformZimit.filterReplayFiles().
 * @param {callbackDirEntry} callback Optional function called with the DirEntry once it has been found
 * @returns {Promise} that resolves to the DirEntry (after the callback has run), or to null if the archive is not ready
 */
ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
    if (!this.isReady()) {
        // As before, the callback is not invoked for an archive that is not ready
        console.error('ZIMArchive is not ready! Cannot look up the main page.');
        return Promise.resolve(null);
    }
    var that = this;
    // Return the chain so that callers can await the result, as the documentation has always promised
    return this._file.dirEntryByUrlIndex(this._file.mainPage).then(function (dirEntry) {
        // Filter out Zimit files that we cannot handle without error
        if (that._file.zimType === 'zimit') dirEntry = transformZimit.filterReplayFiles(dirEntry);
        if (typeof callback === 'function') callback(dirEntry);
        return dirEntry;
    });
};
/**
 * Rebuilds a DirEntry belonging to this archive's file from its string identifier.
 * @param {String} dirEntryId The string identifier, as understood by DirEntry.fromStringId
 * @returns {DirEntry} The value returned by DirEntry.fromStringId for this archive's file
 */
ZIMArchive.prototype.parseDirEntryId = function (dirEntryId) {
    var zimFile = this._file;
    var DirEntry = zimDirEntry.DirEntry;
    return DirEntry.fromStringId(zimFile, dirEntryId);
};
/**
* @callback callbackDirEntryList
* @param {Array.<DirEntry>} dirEntryArray Array of DirEntries found
*/
/**
 * Look for DirEntries with title starting with the prefix of the current search object.
 * For now, ZIM titles are case sensitive.
 * So, as workaround, we try several variants of the prefix to find more results.
 * This should be enhanced when the ZIM format will be modified to store normalized titles
 * See https://phabricator.wikimedia.org/T108536
 *
 * The search object is updated as the search progresses (scanCount, searchUrlIndex, rgxPrefix, type, status, found,
 * lc, window, countReport). When a libzim worker is available, a full-text search is launched as well, and its
 * results replace the title results when it completes.
 *
 * @param {Object} search The current appstate.search object (must provide at least prefix and size)
 * @param {callbackDirEntryList} callback The function to call with the result: callback(dirEntries, search)
 * @param {Boolean} noInterim A flag to prevent callback until all results are ready (used in testing)
 */
ZIMArchive.prototype.findDirEntriesWithPrefix = function (search, callback, noInterim) {
    var that = this;
    // Establish array of initial values that must be searched first. All of these patterns are generated by the full
    // search type, and some by basic, but we need the most common patterns to be searched first, as it returns search
    // results much more quickly if we do this (and the user can click on a result before the rarer patterns complete)
    // NB duplicates are removed before processing search array
    var startArray = [];
    var cns = this.getContentNamespace();
    var dirEntries = [];
    search.scanCount = 0;
    // Check if user prefixed search with a namespace-like pattern. If so, do a search for namespace + url
    if (/^[-ABCHIJMUVWX]\//.test(search.prefix)) search.searchUrlIndex = true;
    // Regex below breaks the string into the pattern: group 1: alphanumericsearch; group 2: regex beginning with .* or .+, or contained in (?:regex)
    var isPrefixRegExp = search.prefix.match(/^((?:[^(.]|\((?!\?:)|\.(?![*+]))*)(\(\?:.*\)|\.[*+].*)$/);
    search.rgxPrefix = null;
    var prefix = search.prefix;
    // Handler for a completed full-text search
    var onFullTextResults = function (fullTextDirEntries) {
        // If user initiated a new search, cancel this one
        // In particular, do not set the search status back to 'complete'
        // as that would cause outdated results to unexpectedly pop up
        if (search.status === 'cancelled') return callback([], search);
        dirEntries = fullTextDirEntries;
        search.status = 'complete';
        callback(dirEntries, search);
    };
    // Handler for a failed full-text search: report whatever the title search has found instead of leaving the
    // search permanently unfinished (and the rejection unhandled)
    var onFullTextFailure = function (err) {
        console.error('The full-text search failed', err);
        if (search.status === 'cancelled') return callback([], search);
        search.status = 'complete';
        callback(dirEntries, search);
    };
    // Launch a full-text search if possible
    if (LZ && !search.searchUrlIndex) {
        that.findDirEntriesFromFullTextSearch(search, dirEntries).then(onFullTextResults, onFullTextFailure);
    }
    if (isPrefixRegExp) {
        // User has initiated a regular expression search - note the only regexp special character allowed in the alphanumeric part is \s
        prefix = isPrefixRegExp[1].replace(/\\s/g, ' ');
        try {
            search.rgxPrefix = new RegExp(isPrefixRegExp[2], 'i');
        } catch (err) {
            // User has incorrect regular expression syntax
            search.status = 'error';
            callback([], search);
            return;
        }
    }
    var prefixNameSpaces = '';
    if (search.searchUrlIndex) {
        var rgxSplitPrefix = /^[-ABCHIJMUVWX]\//;
        if (that._file.zimType === 'zimit' && cns === 'C') {
            // We have to account for the Zimit prefix in Type 1 ZIMs
            rgxSplitPrefix = /^(?:[CMWX]\/)?(?:[AH]\/)?/;
        }
        var splitPrefix = prefix.match(rgxSplitPrefix);
        prefixNameSpaces = splitPrefix ? splitPrefix[0] : '';
        // Remove the matched namespace part. Slicing (rather than splitting on the regex) keeps the prefix intact
        // when nothing, or only the empty string, was matched; splitting yielded undefined in that case
        prefix = prefix.slice(prefixNameSpaces.length);
    }
    // Ensure a search is done on the string exactly as typed
    startArray.push(prefix);
    // Normalize any spacing and make string all lowercase
    prefix = prefix.replace(/\s+/g, ' ').toLocaleLowerCase();
    // Add lowercase string with initial uppercase (this is a very common pattern)
    startArray.push(prefix.replace(/^./, function (m) {
        return m.toLocaleUpperCase();
    }));
    // Add pure lowercase string (rarer)
    startArray.push(prefix);
    // Add a case-insensitive search for the string (pseudo-regex notation)
    startArray.push('/' + prefix + '/i');
    // Get the full array of combinations to check number of combinations
    var fullCombos = util.removeDuplicateStringsInSmallArray(util.allCaseFirstLetters(prefix, 'full'));
    // Put cap on exponential number of combinations (five words = 3^5 = 243 combinations)
    search.type = fullCombos.length < 300 ? 'full' : 'basic';
    // We have to remove duplicate string combinations because util.allCaseFirstLetters() can return some combinations
    // where uppercase and lowercase combinations are exactly the same, e.g. where prefix begins with punctuation
    // or currency signs, for languages without case, or where user-entered case duplicates calculated case
    var prefixVariants = util.removeDuplicateStringsInSmallArray(
        startArray.concat(
            // Get basic combinations first for speed of returning results
            util.allCaseFirstLetters(prefix).concat(
                search.type === 'full' ? fullCombos : []
            )
        )
    );
    // Searches the next pending variant, or finishes the title search when there is nothing (more) to look for
    function searchNextVariant () {
        // If user has initiated a new search, cancel this one
        if (search.status === 'cancelled') return callback([], search);
        var remaining = search.size - dirEntries.length;
        if (prefixVariants.length === 0 || remaining < 1) {
            // We have found all the title-search entries we are going to get, so indicate search type if we're still searching
            if (LZ && !search.searchUrlIndex && search.status !== 'complete') search.type = 'fulltext';
            else if (LZ && search.searchUrlIndex && remaining > 0) {
                // A URL search did not fill the result list, so top it up from the full-text index
                search.type = 'fulltext';
                that.findDirEntriesFromFullTextSearch(search, dirEntries, remaining).then(onFullTextResults, onFullTextFailure);
            } else search.status = 'complete';
            return callback(dirEntries, search);
        }
        // Dynamically populate list of articles
        search.status = 'interim';
        if (!noInterim) callback(dirEntries, search);
        search.found = dirEntries.length;
        var variantPrefix = prefixNameSpaces + prefixVariants[0];
        search.lc = false;
        // If it's pseudo-regex with a case-insensitive flag like '/my search/i', do an enhanced case-insensitive search
        if (/^\/.+\/i$/.test(prefixVariants[0])) {
            search.lc = true;
            variantPrefix = prefixNameSpaces + prefixVariants[0].replace(/^\/(.+)\/i/, '$1').toLocaleLowerCase();
            console.debug('Searching case-insensitively for: "' + variantPrefix + '"');
        }
        // Remove in-progress search variant from array
        prefixVariants = prefixVariants.slice(1);
        // Search window sets an upper limit on how many matching dirEntries will be scanned in a full index search
        search.window = search.rgxPrefix ? 10000 * search.size : search.size;
        that.findDirEntriesWithPrefixCaseSensitive(variantPrefix, search,
            function (newDirEntries, countReport, interim) {
                search.countReport = countReport;
                if (search.status === 'cancelled') return callback([], search);
                if (!noInterim && countReport === true) return callback(dirEntries, search);
                // Only push interim results to the dirEntries array (otherwise we get a duplicated array when the final results are reported to this function)
                if (interim) {
                    // Collect all the found paths for the dirEntries so far
                    var dirEntryPaths = [];
                    for (var i = 0; i < dirEntries.length; i++) {
                        dirEntryPaths.push(dirEntries[i].url);
                    }
                    // Push new directory entries to the end of the global array so long as they are not duplicates
                    for (var j = 0; j < newDirEntries.length; j++) {
                        if (~dirEntryPaths.indexOf(newDirEntries[j].url)) continue;
                        dirEntries.push(newDirEntries[j]);
                    }
                    search.found = dirEntries.length;
                    if (!noInterim && newDirEntries.length) return callback(dirEntries, search);
                } else return searchNextVariant();
            }
        );
    }
    searchNextVariant();
};
/**
 * Returns the namespace in the ZIM file that contains the primary user content. In old-format ZIM files (minor
 * version 0) there are a number of content namespaces, but the primary one in which to search for titles is 'A'.
 * In new-format ZIMs (minor version 1) there is a single content namespace 'C'.
 * See https://openzim.org/wiki/ZIM_file_format.
 * @returns {String} The content namespace for the ZIM archive
 * @throws {Error} If the ZIM is not ready, or if its minor version is greater than 1
 */
ZIMArchive.prototype.getContentNamespace = function () {
    if (!this.isReady()) {
        throw new Error('We could not determine the content namespace because the ZIM file is not ready!');
    }
    var minorVersion = this._file.minorVersion;
    // DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
    // If this changes, adapt the error checking and return values
    if (minorVersion > 1) {
        throw new Error('Unknown ZIM minor version!');
    }
    return minorVersion === 0 ? 'A' : 'C';
};
/**
 * Look for dirEntries with title (or URL) starting with the given prefix (case-sensitive).
 *
 * A binary search locates the first candidate entry, and entries are then scanned sequentially from there until
 * the search is cancelled, enough results have been found, the index is exhausted, the previous title no longer
 * contains the prefix, or search.window entries have been scanned.
 *
 * @param {String} prefix The case-sensitive value against which dirEntry titles (or url) will be compared
 * @param {Object} search The appstate.search object (for comparison, so that we can cancel long binary searches)
 * @param {Function} callback Called with ([dirEntry], false, true) for each interim match and with ([], true, true)
 *     every 5000 scanned titles (both only when startIndex is undefined), then finally with (dirEntries, nextStart)
 * @param {Integer} startIndex The index number with which to commence the search, or null
 */
ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, search, callback, startIndex) {
    // Save the value of startIndex because value of null has a special meaning in combination with prefix:
    // produces a list of matches starting with first match and then next x dirEntries thereafter
    var saveStartIndex = startIndex;
    startIndex = startIndex || 0;
    prefix = prefix || '';
    var cns = this.getContentNamespace();
    var zimFile = this._file;
    // Search v1 article listing if available, otherwise fallback to v0
    var articleCount = zimFile.articleCount || zimFile.entryCount;
    // The lookups are wrapped so that they are always invoked as methods of this archive's own file: a detached
    // method reference loses its 'this', and the counts above must come from the same file as the entries
    var searchFunction = function (index) {
        return zimFile.dirEntryByTitleIndex(index);
    };
    if (search.searchUrlIndex) {
        articleCount = zimFile.entryCount;
        searchFunction = function (index) {
            return zimFile.dirEntryByUrlIndex(index);
        };
    }
    util.binarySearch(startIndex, articleCount, function (i) {
        return searchFunction(i).then(function (dirEntry) {
            if (search.status === 'cancelled') return 0;
            var ns = dirEntry.namespace;
            var ti = search.searchUrlIndex ? dirEntry.url : dirEntry.getTitleOrUrl();
            if (!search.searchUrlIndex) {
                // DEV: This search is redundant if we managed to populate articlePtrLst and articleCount, but it only takes two instructions and
                // provides maximum compatibility with rare ZIMs where attempts to find first and last article (in zimArchive.js) may have failed
                if (ns < cns) return 1;
                if (ns > cns) return -1;
                // We should now be in namespace A (old format ZIM) or C (new format ZIM)
                if (search.lc) { // Search comparator should be lowercase (for case-insensitive search)
                    ti = ti.toLocaleLowerCase();
                    prefix = prefix.toLocaleLowerCase();
                }
                return prefix <= ti ? -1 : 1;
            } else {
                if (search.lc) { // Search comparator should be lowercase (for case-insensitive search)
                    ns = ns + '/' + ti.replace(/^((?:[AH])?)\/?.*/, '$1');
                    ti = ti.replace(/^[AH]\//, '').toLocaleLowerCase();
                }
                // if (search.rgxPrefix && search.rgxPrefix.test(ti)) return -1;
                return prefix <= (ns + '/' + ti) ? -1 : 1;
            }
        });
    }, true).then(function (firstIndex) {
        var vDirEntries = [];
        // Scans entries one at a time from the given index, accumulating matches in vDirEntries
        var addDirEntries = function (index, lastTitle) {
            if (search.status === 'cancelled' || search.found >= search.size || index >= articleCount ||
                lastTitle && !~lastTitle.indexOf(prefix) || index - firstIndex >= search.window) {
                // DEV: Diagnostics to be removed before merge
                if (vDirEntries.length) {
                    console.debug('Scanned ' + (index - firstIndex) + ' titles for "' + prefix +
                        '" (found ' + vDirEntries.length + ' match' + (vDirEntries.length === 1 ? ')' : 'es)'));
                }
                return {
                    dirEntries: vDirEntries,
                    nextStart: index
                };
            }
            return searchFunction(index).then(function (dirEntry) {
                search.scanCount++;
                var title = dirEntry.getTitleOrUrl();
                // If we are searching by URL, display namespace also
                if (search.searchUrlIndex) title = dirEntry.namespace + '/' + dirEntry.url;
                if (search.lc && !search.rgxPrefix) { // Search comparator should be lowercase if not using regex (for case-insensitive search)
                    var ns = title.replace(/^((?:C\/)?(?:[AH]\/)?).*/, '$1');
                    title = ns + title.replace(ns, '').toLocaleLowerCase();
                }
                // Only return dirEntries with titles that actually begin with prefix
                if (saveStartIndex === null || (search.searchUrlIndex || dirEntry.namespace === cns) && title.indexOf(prefix) === 0) {
                    if (!search.rgxPrefix || search.rgxPrefix && search.rgxPrefix.test(title)) { // Regex test case-insensitive if i flag set
                        vDirEntries.push(dirEntry);
                        // Report interim result
                        if (typeof saveStartIndex === 'undefined') callback([dirEntry], false, true);
                    }
                }
                // Report number of titles scanned every 5000 titles
                if (!(search.scanCount % 5000) && typeof saveStartIndex === 'undefined') callback([], true, true);
                return addDirEntries(index + 1, title);
            });
        };
        return addDirEntries(firstIndex);
    }).then(function (objWithIndex) {
        return callback(objWithIndex.dirEntries, objWithIndex.nextStart);
    });
};
/**
 * Find Directory Entries corresponding to the requested search using Full Text search provided by libzim
 *
 * Paths already present in dirEntries (or repeated in the worker's results) are skipped, and result paths that
 * cannot be resolved to a Directory Entry are dropped, so the returned array never contains null entries.
 *
 * @param {Object} search The appstate.search object (search.prefix is the text searched for; scanCount is incremented)
 * @param {Array} dirEntries The array of already found Directory Entries (new entries are appended to this same array)
 * @param {Integer} number Optional positive number of search results requested (otherwise params.maxSearchResultsSize will be used)
 * @returns {Promise} The augmented array of Directory Entries with titles that correspond to search
 */
ZIMArchive.prototype.findDirEntriesFromFullTextSearch = function (search, dirEntries, number) {
    var cns = this.getContentNamespace();
    var that = this;
    var resultsNeeded = number || params.maxSearchResultsSize;
    return this.callLibzimWorker({ action: 'search', text: search.prefix, numResults: resultsNeeded }).then(function (results) {
        // The worker may answer with nothing, or with a message that carries no result list
        if (!results || !results.entries) return dirEntries;
        // Record the paths we already have in a lookup table so that each result is checked in constant time
        var knownPaths = Object.create(null);
        for (var i = 0; i < dirEntries.length; i++) {
            knownPaths[dirEntries[i].namespace + '/' + dirEntries[i].url] = true;
        }
        // Collect all the paths for full text search, pruning as we go
        var fullTextPaths = [];
        for (var j = 0; j < results.entries.length; j++) {
            search.scanCount++;
            var path = results.entries[j].path;
            // Full-text search result paths are missing the namespace in Type 1 ZIMs, so we add it back
            path = cns === 'C' ? cns + '/' + path : path;
            if (knownPaths[path]) continue;
            knownPaths[path] = true;
            fullTextPaths.push(path);
        }
        var promisesForDirEntries = fullTextPaths.map(function (fullTextPath) {
            return that.getDirEntryByPath(fullTextPath);
        });
        return Promise.all(promisesForDirEntries).then(function (fullTextDirEntries) {
            for (var k = 0; k < fullTextDirEntries.length; k++) {
                // getDirEntryByPath resolves to null for a path it cannot find: do not hand that on to consumers
                if (fullTextDirEntries[k]) dirEntries.push(fullTextDirEntries[k]);
            }
            return dirEntries;
        });
    });
};
/**
 * Calls the libzim Web Worker with the given parameters, and returns a Promise with its response.
 *
 * A dedicated MessageChannel is created for each call: one port is transferred to the worker along with the
 * parameters, and the first message received on the other port settles the Promise.
 *
 * @param {Object} parameters The message to post to the worker (e.g. an action and its arguments)
 * @returns {Promise} Resolves with the data of the worker's reply; rejects if there is no worker, if the message
 *     cannot be posted, or if the reply cannot be deserialized
 */
ZIMArchive.prototype.callLibzimWorker = function (parameters) {
    return new Promise(function (resolve, reject) {
        if (!LZ) {
            reject(new Error('The libzim Web Worker has not been instantiated'));
            return;
        }
        console.debug('Calling libzim WebWorker with parameters', parameters);
        var tmpMessageChannel = new MessageChannel();
        var replyPort = tmpMessageChannel.port1;
        replyPort.onmessage = function (event) {
            // Only one reply is expected per call, so release the channel as soon as it arrives
            replyPort.close();
            resolve(event.data);
        };
        // MessagePort reports failures through 'messageerror' (it has no 'error' event)
        replyPort.onmessageerror = function () {
            replyPort.close();
            reject(new Error('The reply from the libzim Web Worker could not be deserialized'));
        };
        try {
            LZ.postMessage(parameters, [tmpMessageChannel.port2]);
        } catch (err) {
            // Nothing will ever answer on this channel, so do not leave the port open
            replyPort.close();
            reject(err);
        }
    });
};
/**
* @callback callbackDirEntry
* @param {DirEntry} dirEntry The DirEntry found
*/
/**
 * Follows a redirect: looks up the Directory Entry at the URL index given by dirEntry.redirectTarget and passes it
 * to the callback. In Zimit archives the target is first passed through transformZimit.filterReplayFiles().
 * @param {DirEntry} dirEntry The redirecting Directory Entry
 * @param {callbackDirEntry} callback The function to call with the resolved Directory Entry
 */
ZIMArchive.prototype.resolveRedirect = function (dirEntry, callback) {
    var archive = this;
    function deliver (target) {
        var isZimit = archive._file.zimType === 'zimit';
        callback(isZimit ? transformZimit.filterReplayFiles(target) : target);
    }
    this._file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(deliver);
};
/**
* @callback callbackStringContent
* @param {String} content String content
*/
/**
 * Reads the content of a Directory Entry and decodes it as UTF-8 text.
 *
 * For (X)HTML content, a meta Content-Type tag declaring a charset other than UTF-8 is removed. Entries flagged
 * for inspection, or carrying a Zimit redirect, are followed to their redirect target, which is read in their
 * place. In Zimit archives, HTML, CSS and JavaScript content is passed through transformZimit.transformReplayUrls().
 * If anything fails, the callback receives an empty string.
 *
 * @param {DirEntry} dirEntry The Directory Entry to read
 * @param {Function} callback Called with (dirEntry, content), where content is the decoded string
 * @returns {Promise} A Promise that settles once the entry (or its redirect target) has been processed
 */
ZIMArchive.prototype.readUtf8File = function (dirEntry, callback) {
    var cns = appstate.selectedArchive.getContentNamespace();
    return dirEntry.readData().then(function (data) {
        var mimetype = dirEntry.getMimetype();
        if (window.TextDecoder) {
            data = new TextDecoder('utf-8').decode(data);
        } else {
            // Support for IE11 and Edge Legacy - only support UTF-8 decoding
            data = utf8.parse(data);
        }
        if (/\bx?html\b/i.test(mimetype)) {
            // Some Zimit assets have moved location and we need to follow the moved permanently data
            if (/301\s*moved\s+permanently/i.test(data)) dirEntry = transformZimit.getZimitRedirect(dirEntry, data, cns);
            // Some Zimit archives have an incorrect meta charset tag. See https://github.com/openzim/warc2zim/issues/88.
            // So we remove it! (The data have already been decoded as UTF-8, so any other declared charset is wrong)
            data = data.replace(/<meta\b[^>]+?Content-Type[^>]+?charset=([^'"\s]+)[^>]+>\s*/i, function (m0, m1) {
                return /utf-8/i.test(m1) ? m0 : '';
            });
        }
        if (dirEntry.inspect || dirEntry.zimitRedirect) {
            if (dirEntry.inspect) dirEntry = transformZimit.getZimitRedirect(dirEntry, data, cns);
            if (dirEntry.zimitRedirect) {
                return appstate.selectedArchive.getDirEntryByPath(dirEntry.zimitRedirect).then(function (rd) {
                    return appstate.selectedArchive.readUtf8File(rd, callback);
                });
            }
            // The inspection found no redirect to follow: still report what was read, otherwise the caller waits forever
            callback(dirEntry, data);
        } else {
            // DEV: Note that we cannot terminate regex below with $ because there is a (rogue?) mimetype
            // of 'text/html;raw=true'
            if (params.zimType === 'zimit' && /\/(?:html|css|javascript)\b/i.test(mimetype)) {
                data = transformZimit.transformReplayUrls(dirEntry, data, mimetype);
            }
            callback(dirEntry, data);
        }
    }).catch(function (e) {
        console.error('Error reading directory entry', e);
        callback(dirEntry, '');
    });
};
/**
* @callback callbackBinaryContent
* @param {Uint8Array} content binary content
*/
/**
 * Read a binary file.
 *
 * Entries flagged for inspection are checked for a Zimit redirect, and the redirect target is read in their place
 * if one is found. In Zimit archives, HTML, CSS and JavaScript content is converted to a string and passed
 * through transformZimit.transformReplayUrls(), so for those types the callback receives the transformed text.
 *
 * @param {DirEntry} dirEntry The Directory Entry to read
 * @param {Function} callback Called with (dirEntry, content)
 * @returns {Promise} A Promise that settles once the entry (or its redirect target) has been processed
 */
ZIMArchive.prototype.readBinaryFile = function (dirEntry, callback) {
    return dirEntry.readData().then(function (data) {
        var mimetype = dirEntry.getMimetype();
        if (dirEntry.inspect) {
            dirEntry = transformZimit.getZimitRedirect(dirEntry, utf8.parse(data), appstate.selectedArchive.getContentNamespace());
            if (dirEntry.zimitRedirect) {
                return appstate.selectedArchive.getDirEntryByPath(dirEntry.zimitRedirect).then(function (rd) {
                    return appstate.selectedArchive.readBinaryFile(rd, callback);
                });
            }
            // The inspection found no redirect to follow: still report what was read, otherwise the caller waits forever
            callback(dirEntry, data);
        } else {
            // DEV: Note that we cannot terminate regex below with $ because there is a (rogue?) mimetype
            // of 'text/html;raw=true'
            if (params.zimType === 'zimit' && /\/(?:html|css|javascript)\b/i.test(mimetype)) {
                data = transformZimit.transformReplayUrls(dirEntry, utf8.parse(data), mimetype);
            }
            callback(dirEntry, data);
        }
    });
};
/**
* Searches the URL pointer list of Directory Entries by pathname.
*
* The path is looked up with a binary search over the URL index, comparing it against namespace + "/" + url of
* each probed entry. If no entry is found, several fallbacks are tried in turn (each one re-enters this function
* or hands over to fuzzySearch):
*  1. in Zimit archives, the same path in the Header namespace ('H/' or 'C/H/');
*  2. if that header lookup also fails and the original path is the article expected to be displayed, a fuzzy
*     (case-insensitive, regex-based) search of the URL index;
*  3. otherwise, the same path with its first directory after the 'A/' or 'C/A/' namespace removed.
* NB: the path that started a header lookup is kept in the shared appstate.originalPath property.
*
* @param {String} path The pathname of the DirEntry that is required (namespace + filename)
* @param {Boolean} zimitResolving A flag to indicate that a Zimit path is in a lookup loop
* @param {String} originalPath Optional string used internally to prevent infinite loop
* @return {Promise} A Promise that resolves to a Directory Entry, or null if not found.
*/
ZIMArchive.prototype.getDirEntryByPath = function(path, zimitResolving, originalPath) {
var that = this;
// Remember the path that triggered a header lookup so that the fallbacks below can refer back to it
if (originalPath) appstate.originalPath = originalPath;
// Drop the '?kiwix-display' marker, which is not part of the stored URL
path = path.replace(/\?kiwix-display/, '');
// Correct obvious errors
if (!originalPath) {
// Discard anything that precedes the 'A/' or 'C/A/' namespace (the lookahead selects the last such namespace marker in the path)
var revisedPath = path.replace(/.*?((?:C\/A|A)\/(?!.*(?:C\/A|A)).+)$/, '$1');
if (revisedPath !== path) {
console.warn('*** Revised path from ' + path + '\nto: ' + revisedPath + ' ***');
if (appstate.selectedArchive._file.zimType === 'zimit') {
console.debug('*** DEV: Consider correcting this error in tranformZimit.js ***');
}
path = revisedPath;
}
}
// Comparator for the binary search: -1 to look lower, 1 to look higher, 0 for an exact match
return util.binarySearch(0, this._file.entryCount, function(i) {
return that._file.dirEntryByUrlIndex(i).then(function(dirEntry) {
var url = dirEntry.namespace + "/" + dirEntry.url;
if (path < url) {
return -1;
} else if (path > url) {
return 1;
} else {
return 0;
}
});
}).then(function (index) {
// A null index is treated as "no entry at this path" (presumably what util.binarySearch yields when there is no exact match)
if (index === null) return null;
return that._file.dirEntryByUrlIndex(index);
}).then(function (dirEntry) {
// Filter Zimit dirEntries and do some initial transforms
if (that._file.zimType === 'zimit')
dirEntry = transformZimit.filterReplayFiles(dirEntry);
if (!dirEntry) {
// We couldn't get the dirEntry, so look it up in the Zimit header
if (!zimitResolving && that._file.zimType === 'zimit' && !/^(H|C\/H)\//.test(path) && path !== appstate.originalPath) {
// We need to look the file up in the Header namespace (double replacement ensures both types of ZIM are supported)
var oldPath = path;
path = path.replace(/^A\//, 'H/').replace(/^(C\/)A\//, '$1H/');
console.debug('DirEntry ' + oldPath + ' not found, looking up header: ' + path);
return that.getDirEntryByPath(path, true, oldPath);
// } else if (zimitResolving) {
} else if (zimitResolving && appstate.originalPath && appstate.originalPath === appstate.expectedArticleURLToBeDisplayed) {
// We couldn't find the Header, so try a fuzzy search only if the user is loading an article
path = appstate.originalPath;
var ns = path.replace(/^((?:C\/)?A\/).*/, '$1'); // If Zimit pseudo-namespaces are changed, will need to edit this
path = path.replace(ns, '');
path = path.toLocaleLowerCase(); // We are going to combine case-insensitive string comparison with regex matching
var rgxPath = path.replace(/([-/?.$^|*+()[{])/g, '\\$1'); // Make sure we escape regex characters
path = ns + path; // Add namespace back to path for full matching
// path = ns;
// Search object for fuzzySearch: stop at the first entry whose namespaced URL matches the escaped path anywhere in the string
var search = {
rgxPrefix: new RegExp('.*' + rgxPath, 'i'),
searchUrlIndex: true,
lc: true, // Make the comparator (e.g. dirEntry.url) lowercase
size: 1,
found: 0
}
return fuzzySearch(path, search);
} else {
// Last resort: remove the first directory after the namespace and try again, until nothing is left to remove
var newpath = path.replace(/^((?:A|C\/A)\/)[^/]+\/(.+)$/, '$1$2');
if (newpath === path) return null; // No further paths to explore!
console.log("Article " + path + " not available, but moving up one directory to compensate for ZIM coding error...");
return that.getDirEntryByPath(newpath);
}
} else {
// DEBUG: List found Directory Entry
// if (dirEntry) console.debug('Found ' + path);
return dirEntry;
}
});
};
/**
 * Initiate a fuzzy search for dirEntries matching the search object, using the currently selected archive.
 * The first result reported by the search (if it has a url and nothing has been found yet) is passed through
 * transformZimit.filterReplayFiles() and becomes the resolution value.
 * @param {String} path Human-readable path to search for
 * @param {Object} search The search object (its found counter is incremented when a result is accepted)
 * @returns {Promise} A Promise that resolves to a Directory Entry, or null if not found
 */
function fuzzySearch (path, search) {
    var spinnerMessage = 'Fuzzy search for ' + path + '...';
    return new Promise(function (resolve) {
        console.log('Initiating fuzzy search for ' + path + '...');
        uiUtil.pollSpinner(spinnerMessage, true);
        var onResults = function (results) {
            var firstHit = results && results[0];
            if (search.found || !firstHit || !firstHit.url) {
                console.debug('No fuzzy search results found');
                resolve(null);
                return;
            }
            search.found++;
            var filtered = transformZimit.filterReplayFiles(firstHit);
            if (filtered) console.debug('Found ' + filtered.url + ' in fuzzy search');
            resolve(filtered);
        };
        // A null start index asks the search to accept entries without requiring them to begin with the prefix
        appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(path, search, onResults, null);
    });
}
/**
 * Picks a Directory Entry at a random position of the title index and passes it to the callback.
 * @param {callbackDirEntry} callback The function to call with the randomly chosen Directory Entry
 */
ZIMArchive.prototype.getRandomDirEntry = function (callback) {
    var zimFile = this._file;
    // Prefer an article-only (v1) title pointer list, if available
    var candidateCount = zimFile.articleCount ? zimFile.articleCount : zimFile.entryCount;
    var randomIndex = Math.floor(candidateCount * Math.random());
    zimFile.dirEntryByTitleIndex(randomIndex).then(callback);
};
/**
 * Read a Metadata string inside the ZIM file. The entry is looked up at the path 'M/' + key.
 * @param {String} key The name of the metadata entry
 * @param {callbackMetadata} callback Called with the metadata string, or with no argument if the entry
 *     could not be found or the lookup failed
 */
ZIMArchive.prototype.getMetadata = function (key, callback) {
    var archive = this;
    var metadataPath = 'M/' + key;
    archive.getDirEntryByPath(metadataPath).then(function (dirEntry) {
        var isMissing = dirEntry === undefined || dirEntry === null;
        if (isMissing) {
            console.warn('Title M/' + key + ' not found in the archive');
            callback();
            return;
        }
        archive.readUtf8File(dirEntry, function (dirEntryRead, data) {
            callback(data);
        });
    }).catch(function (e) {
        console.warn('Metadata with key ' + key + ' not found in the archive', e);
        callback();
    });
};
// Public interface of this module: exposes the ZIMArchive constructor as a property of the default export
export default {
ZIMArchive: ZIMArchive
};