diff --git a/www/js/lib/cache.js b/www/js/lib/cache.js index 5d1be7fe..6c51a102 100644 --- a/www/js/lib/cache.js +++ b/www/js/lib/cache.js @@ -444,7 +444,7 @@ define(['q', 'settingsStore', 'uiUtil'], function(Q, settingsStore, uiUtil) { selectedArchive.readUtf8File : selectedArchive.readBinaryFile; // Bypass getting dirEntry if we already have it var getDirEntry = dirEntry ? Q.Promise.resolve() : - selectedArchive.getDirEntryByTitle(title); + selectedArchive.getDirEntryByPath(title); // Read data from ZIM getDirEntry.then(function(resolvedDirEntry) { if (dirEntry) resolvedDirEntry = dirEntry; diff --git a/www/js/lib/images.js b/www/js/lib/images.js index d1b46285..39e7633b 100644 --- a/www/js/lib/images.js +++ b/www/js/lib/images.js @@ -84,7 +84,7 @@ define(['uiUtil'], function (uiUtil) { }); return; } - appstate.selectedArchive.getDirEntryByTitle(title).then(function (dirEntry) { + appstate.selectedArchive.getDirEntryByPath(title).then(function (dirEntry) { return appstate.selectedArchive.readBinaryFile(dirEntry, function (fileDirEntry, content) { image.style.background = ''; var mimetype = dirEntry.getMimetype(); diff --git a/www/js/lib/zimArchive.js b/www/js/lib/zimArchive.js index bff83af2..732a4739 100644 --- a/www/js/lib/zimArchive.js +++ b/www/js/lib/zimArchive.js @@ -46,9 +46,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], * Creates a ZIM archive object to access the ZIM file at the given path in the given storage. * This constructor can also be used with a single File parameter. 
* - * @param {StorageFirefoxOS|Array.} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted) - * @param {String} path - * @param {callbackZIMArchive} callbackReady + * @param {StorageFirefoxOS|Array} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted) + * @param {String} path The Storage path for an OS that requires this to be specified + * @param {callbackZIMArchive} callbackReady The function to call when the archive is ready to use */ function ZIMArchive(storage, path, callbackReady) { var that = this; @@ -57,6 +57,28 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], var createZimfile = function(fileArray) { zimfile.fromFileArray(fileArray).then(function(file) { that._file = file; + // File has been created, but we need to add any Listings which extend the archive metadata + that._file.setListings([ + // Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file + // 'ptrName' and 'countName' contain the key names to be set in the archive file object + { + // This defines the standard v0 (legacy) title index that contains listings for every entry in the ZIM (not just articles) + // It represents the same index that is referenced in the ZIM archive header + path: 'X/listing/titleOrdered/v0', + ptrName: 'titlePtrPos', + countName: 'entryCount' + }, + { + // This defines a new version 1 index that is present in no-namespace ZIMs, and contains a title-ordered list of articles + path: 'X/listing/titleOrdered/v1', + ptrName: 'articlePtrPos', + countName: 'articleCount' + } + ]); + // DEV: Currently, extended listings are only used for title (=article) listings when the user searches + // for an article or uses the Random button, by which time the listings will have been extracted. 
+ // If, in the future, listings are used in a more time-critical manner, consider forcing a wait before + // declaring the archive to be ready, by chaining the following callback in a .then() function of setListings. callbackReady(that); }); }; @@ -251,7 +273,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], prefix = prefix || ''; var that = this; var cns = this.getContentNamespace(); - util.binarySearch(startIndex, this._file.articleCount, function(i) { + // Search v1 article listing if available, otherwise fallback to v0 + var articleCount = this._file.articleCount || this._file.entryCount; + util.binarySearch(startIndex, articleCount, function(i) { return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) { if (search.status === 'cancelled') return 0; var ns = dirEntry.namespace; @@ -263,7 +287,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], }, true).then(function(firstIndex) { var dirEntries = []; var addDirEntries = function(index) { - if (search.status === 'cancelled' || index >= firstIndex + resultSize || index >= that._file.articleCount) { + if (search.status === 'cancelled' || index >= firstIndex + resultSize || index >= articleCount) { return { 'dirEntries': dirEntries, 'nextStart': index @@ -333,18 +357,18 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], }; /** - * Searches a DirEntry (article / page) by its title. - * @param {String} title - * @return {Promise} resolving to the DirEntry object or null if not found. + * Searches the URL pointer list of Directory Entries by pathname + * @param {String} path The pathname of the DirEntry that is required (namespace + filename) + * @return {Promise} A Promise that resolves to a Directory Entry, or null if not found. 
*/ - ZIMArchive.prototype.getDirEntryByTitle = function(title) { + ZIMArchive.prototype.getDirEntryByPath = function(path) { var that = this; - return util.binarySearch(0, this._file.articleCount, function(i) { + return util.binarySearch(0, this._file.entryCount, function(i) { return that._file.dirEntryByUrlIndex(i).then(function(dirEntry) { var url = dirEntry.namespace + "/" + dirEntry.url; - if (title < url) + if (path < url) return -1; - else if (title > url) + else if (path > url) return 1; else return 0; @@ -353,10 +377,10 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], if (index === null) return null; return that._file.dirEntryByUrlIndex(index); }).then(function(dirEntry) { - if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(title)) { - console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error..."); - title = title.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2'); - return that.getDirEntryByTitle(title); + if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(path)) { + console.log("Article " + path + " not available, but moving up one directory to compensate for ZIM coding error..."); + path = path.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2'); + return that.getDirEntryByPath(path); } else { return dirEntry; } @@ -368,8 +392,10 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], * @param {callbackDirEntry} callback */ ZIMArchive.prototype.getRandomDirEntry = function(callback) { - var index = Math.floor(Math.random() * this._file.articleCount); - this._file.dirEntryByUrlIndex(index).then(callback); + // Prefer an article-only (v1) title pointer list, if available + var articleCount = this._file.articleCount || this._file.entryCount; + var index = Math.floor(Math.random() * articleCount); + this._file.dirEntryByTitleIndex(index).then(callback); }; /** @@ -379,7 +405,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'], */ ZIMArchive.prototype.getMetadata = 
function (key, callback) { var that = this; - this.getDirEntryByTitle("M/" + key).then(function (dirEntry) { + this.getDirEntryByPath("M/" + key).then(function (dirEntry) { if (dirEntry === null || dirEntry === undefined) { console.warn("Title M/" + key + " not found in the archive"); callback(); diff --git a/www/js/lib/zimfile.js b/www/js/lib/zimfile.js index 61bdecec..71d83f9b 100644 --- a/www/js/lib/zimfile.js +++ b/www/js/lib/zimfile.js @@ -53,14 +53,17 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', * @property {Array} _files Array of ZIM files * @property {String} name Abstract archive name for file set * @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive - * @property {Integer} articleCount Total number of articles + * @property {Integer} entryCount Total number of entries in the URL pointerlist + * @property {Integer} articleCount Total number of articles in the v1 article-only pointerlist (async calculated entry) * @property {Integer} clusterCount Total number of clusters * @property {Integer} urlPtrPos Position of the directory pointerlist ordered by URL - * @property {Integer} titlePtrPos Position of the directory pointerlist ordered by title + * @property {Integer} titlePtrPos Position of the legacy v0 pointerlist ordered by title + * @property {Integer} articlePtrPos Position of the v1 article-only pointerlist ordered by title (async calculated entry) * @property {Integer} clusterPtrPos Position of the cluster pointer list * @property {Integer} mimeListPos Position of the MIME type list (also header size) * @property {Integer} mainPage Main page or 0xffffffff if no main page * @property {Integer} layoutPage Layout page or 0xffffffffff if no layout page + * @property {Map} mimeTypes The ZIM file's MIME type table rendered as a Map (calculated entry) */ /** @@ -119,7 +122,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', return readRequests[0]; } 
else { // Wait until all are resolved and concatenate. - return Q.all(readRequests).then(function(arrays) { + return Q.all(readRequests).then(function (arrays) { var concatenated = new Uint8Array(end - begin); var offset = 0; arrays.forEach(function (item) { @@ -181,7 +184,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', */ ZIMFile.prototype.dirEntryByTitleIndex = function (index) { var that = this; - return this._readInteger(this.titlePtrPos + index * 4, 4).then(function (urlIndex) { + // Use v1 title pointerlist if available, or fall back to legacy v0 list + var ptrList = this.articlePtrPos || this.titlePtrPos; + return this._readInteger(ptrList + index * 4, 4).then(function (urlIndex) { return that.dirEntryByUrlIndex(urlIndex); }); }; @@ -190,28 +195,44 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', * Read and if necessary decompress a BLOB based on its cluster number and blob number * @param {Integer} cluster The cluster number where the blob is to be found * @param {Integer} blob The blob number within the cluster + * @param {Boolean} meta If true, and if the cluster is uncompressed, the function will return only the blob's metadata + * (its archive offset and its size), otherwise return null * @returns {Promise} A Promise for the BLOB's data */ - ZIMFile.prototype.blob = function (cluster, blob) { + ZIMFile.prototype.blob = function (cluster, blob, meta) { var that = this; return this._readSlice(this.clusterPtrPos + cluster * 8, 16).then(function (clusterOffsets) { var clusterOffset = readInt(clusterOffsets, 0, 8); var nextCluster = readInt(clusterOffsets, 8, 8); // DEV: The method below of calculating cluster size is not safe: see https://github.com/openzim/libzim/issues/84#issuecomment-612962250 // var thisClusterLength = nextCluster - clusterOffset - 1; - return that._readSlice(clusterOffset, 1).then(function(compressionType) { + return that._readSlice(clusterOffset, 1).then(function 
(compressionType) { var decompressor; - var plainBlobReader = function(offset, size) { + var plainBlobReader = function (offset, size, dataPass) { // Check that we are not reading beyond the end of the cluster var offsetStart = clusterOffset + 1 + offset; - if ( offsetStart < nextCluster) { + if (offsetStart < nextCluster) { // Gratuitous parentheses added for legibility size = (offsetStart + size) <= nextCluster ? size : (nextCluster - offsetStart); - return that._readSlice(offsetStart, size); + // DEV: This blob reader is called twice: on the first pass it reads the cluster's blob list, + // and on the second pass ("dataPass") it is ready to read the blob's data + if (meta && dataPass) { + // If only metadata were requested and we are on the data pass, we should now have them + return { + ptr: offsetStart, + size: size + }; + } else { + return that._readSlice(offsetStart, size); + } } else { return Q(new Uint8Array(0).buffer); } }; + // If only metadata were requested and the cluster is compressed, return null (this is probably a ZIM format error) + // DEV: This is because metadata are only requested for finding absolute offsets into uncompressed clusters, + // principally for finding the start and size of a title pointer listing + if (meta && compressionType[0] > 1) return null; if (compressionType[0] === 0 || compressionType[0] === 1) { // uncompressed decompressor = { readSliceSingleThread: plainBlobReader }; @@ -222,21 +243,91 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', } else { return new Uint8Array(); // unsupported compression type } - return decompressor.readSliceSingleThread(blob * 4, 8).then(function(data) { + return decompressor.readSliceSingleThread(blob * 4, 8, false).then(function (data) { var blobOffset = readInt(data, 0, 4); var nextBlobOffset = readInt(data, 4, 4); - return decompressor.readSliceSingleThread(blobOffset, nextBlobOffset - blobOffset); + return decompressor.readSliceSingleThread(blobOffset, 
nextBlobOffset - blobOffset, true); }); }); }); }; + /** + * A Directory Listing object + * @typedef {Object} DirListing A list of pointers to directory entries (via the URL pointerlist) + * @property {String} path The path (url) to the directory entry for the Listing + * @property {String} ptrName The name of the pointer to the Listing's data that will be added to the ZIMFile object + * @property {String} countName The name of the key that will contain the number of entries in the Listing, to be added to the ZIMFile object + */ + + /** + * Read the metadata (archive offset pointer, and number of entries) of one or more ZIM directory Listings. + * This supports reading a subset of user content that might be ordered differently from the main URL pointerlist. + * In particular, it supports v1 title pointerlists, which contain articles sorted by title, superseding the article + * namespace ('A') in legacy ZIM archives. + * @param {Array} listings An array of DirListing objects (see zimArchive.js for examples) + */ + ZIMFile.prototype.setListings = function(listings) { + // If we are in a legacy ZIM archive, there is nothing further to look up + if (this.minorVersion === 0) { + console.debug('ZIM DirListing version: 0 (legacy)', this); + return; + } + var that = this; + var highestListingVersion = 0; + var listingAccessor = function (listing) { + if (!listing) { + // No more listings, so exit + console.debug('ZIM DirListing version: ' + highestListingVersion, that); + return null; + } + // Check if we already have this listing's values, so we don't do redundant binary searches + if (that[listing.ptrName] && that[listing.countName]) { + highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion); + // Get the next listing + return listingAccessor(listings.pop()); + } + // Initiate a binary search for the listing URL + return util.binarySearch(0, that.entryCount, function(i) { + return 
that.dirEntryByUrlIndex(i).then(function(dirEntry) { + var url = dirEntry.namespace + "/" + dirEntry.url; + if (listing.path < url) + return -1; + else if (listing.path > url) + return 1; + else + return 0; + }); + }).then(function(index) { + if (index === null) return null; + return that.dirEntryByUrlIndex(index); + }).then(function(dirEntry) { + if (!dirEntry) return null; + // Request the metadata for the blob represented by the dirEntry + return that.blob(dirEntry.cluster, dirEntry.blob, true); + }).then(function(metadata) { + // Note that we do not accept a listing if its size is 0, i.e. if it contains no data + // (although this should not occur, we have been asked to handle it - see kiwix-js #708) + if (metadata && metadata.size) { + that[listing.ptrName] = metadata.ptr; + that[listing.countName] = metadata.size / 4; // Each entry uses 4 bytes + highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion); + } + // Get the next Listing + return listingAccessor(listings.pop()); + }).catch(function(err) { + console.error('There was an error accessing a Directory Listing', err); + }); + }; + listingAccessor(listings.pop()); + }; + /** * Reads the whole MIME type list and returns it as a populated Map * The mimeTypeMap is extracted once after the user has picked the ZIM file * and is stored as ZIMFile.mimeTypes * @param {File} file The ZIM file (or first file in array of files) from which the MIME type list -* is to be extracted + * is to be extracted * @param {Integer} mimeListPos The offset in at which the MIME type list is found * @param {Integer} urlPtrPos The offset of URL Pointer List in the archive * @returns {Promise} A promise for the MIME Type list as a Map @@ -254,7 +345,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', var pos = -1; var mimeString; while (pos < size) { - pos++; + pos++; mimeString = utf8.parse(data.subarray(pos), true); // If the parsed data is an empty string, we 
have reached the end of the MIME type list, so break if (!mimeString) break; @@ -267,30 +358,30 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', } } return typeMap; - }).catch(function(err) { + }).catch(function (err) { console.error('Unable to read MIME type list', err); return new Map; }); } - + return { /** * @param {Array} fileArray An array of picked archive files * @returns {Promise} A Promise for the ZimFile Object */ - fromFileArray: function(fileArray) { + fromFileArray: function (fileArray) { // Array of blob objects should be sorted by their name property - fileArray.sort(function(a, b) { - var nameA = a.name.toUpperCase(); - var nameB = b.name.toUpperCase(); + fileArray.sort(function (a, b) { + var nameA = a.name.toUpperCase(); + var nameB = b.name.toUpperCase(); if (nameA < nameB) return -1; if (nameA > nameB) return 1; - return 0; + return 0; }); - return util.readFileSlice(fileArray[0], 0, 80).then(function(header) { + return util.readFileSlice(fileArray[0], 0, 80).then(function (header) { var mimeListPos = readInt(header, 56, 8); var urlPtrPos = readInt(header, 32, 8); - return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) { + return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (mapData) { var zf = new ZIMFile(fileArray); // Add an abstract archive name (ignoring split file extensions) zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1'); @@ -303,15 +394,17 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', // For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace - zf.articleCount = readInt(header, 24, 4); + zf.entryCount = readInt(header, 24, 4); + zf.articleCount = null; // Calculated async by setListings() called from 
zimArchive.js zf.clusterCount = readInt(header, 28, 4); zf.urlPtrPos = urlPtrPos; zf.titlePtrPos = readInt(header, 40, 8); + zf.articlePtrPos = null; // Calculated async by setListings() called from zimArchive.js zf.clusterPtrPos = readInt(header, 48, 8); zf.mimeListPos = mimeListPos; zf.mainPage = readInt(header, 64, 4); zf.layoutPage = readInt(header, 68, 4); - zf.mimeTypes = data; + zf.mimeTypes = mapData; return zf; }); });