Support v1 directory listings in no-namespace ZIM archives

Former-commit-id: 7a845c89af339e06b60f135720d5fa7ff89e1566 [formerly 3747fcc9bc66a988158e75821d82200b46341cfc] [formerly a33b5b9586e253d1fa52d3021e910fb8060c5ad9] [formerly e782bee207ec07a5a974ff559974bca494ac84c0 [formerly 4d35f36d9c9e499c17a4ccaa9cb4e7ff05f04893 [formerly 2606c489dae01e64640896bcad93d6511f77dbf6]]] Former-commit-id: 2046bc5adc230637e777d20766f5b541d23f7b09 [formerly ecee0a4297ad0dfcdcebc990ec001007d27cafdb [formerly 117594ac0431b953d4c528cac41bc60d0bf5fffa]] Former-commit-id: 7bca7338d773a9b28beb5bbf0c6dc8ce15591331 [formerly e1285c39db7e43377d3f75ae7c5cb00d4daa1bb2] Former-commit-id: c74944f151c57050c269191e34e2f83234746560
2025-09-08 03:37:12 -04:00 · 2021-04-03 13:18:24 +01:00 · 2021-04-03 13:18:24 +01:00 · af25d663b1
commit af25d663b1
parent 2768534578
4 changed files with 164 additions and 45 deletions
--- a/www/js/lib/cache.js
+++ b/www/js/lib/cache.js
@ -444,7 +444,7 @@ define(['q', 'settingsStore', 'uiUtil'], function(Q, settingsStore, uiUtil) {
                selectedArchive.readUtf8File : selectedArchive.readBinaryFile;
            // Bypass getting dirEntry if we already have it
            var getDirEntry = dirEntry ? Q.Promise.resolve() :
-                selectedArchive.getDirEntryByTitle(title);
+                selectedArchive.getDirEntryByPath(title);
            // Read data from ZIM
            getDirEntry.then(function(resolvedDirEntry) {
                if (dirEntry) resolvedDirEntry = dirEntry;
--- a/www/js/lib/images.js
+++ b/www/js/lib/images.js
@ -84,7 +84,7 @@ define(['uiUtil'], function (uiUtil) {
                });
                return;
            }
-            appstate.selectedArchive.getDirEntryByTitle(title).then(function (dirEntry) {
+            appstate.selectedArchive.getDirEntryByPath(title).then(function (dirEntry) {
                return appstate.selectedArchive.readBinaryFile(dirEntry, function (fileDirEntry, content) {
                    image.style.background = '';
                    var mimetype = dirEntry.getMimetype();
--- a/www/js/lib/zimArchive.js
+++ b/www/js/lib/zimArchive.js
@ -46,9 +46,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
     * Creates a ZIM archive object to access the ZIM file at the given path in the given storage.
     * This constructor can also be used with a single File parameter.
     * 
-     * @param {StorageFirefoxOS|Array.<Blob>} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted)
-     * @param {String} path
-     * @param {callbackZIMArchive} callbackReady
+     * @param {StorageFirefoxOS|Array<Blob>} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted)
+     * @param {String} path The Storage path for an OS that requires this to be specified
+     * @param {callbackZIMArchive} callbackReady The function to call when the archive is ready to use
     */
    function ZIMArchive(storage, path, callbackReady) {
        var that = this;
@ -57,6 +57,28 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
        var createZimfile = function(fileArray) {
            zimfile.fromFileArray(fileArray).then(function(file) {
                that._file = file;
+                // File has been created, but we need to add any Listings which extend the archive metadata
+                that._file.setListings([
+                    // Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file
+                    // 'ptrName' and 'countName' contain the key names to be set in the archive file object
+                    {
+                        // This defines the standard v0 (legacy) title index that contains listings for every entry in the ZIM (not just articles)
+                        // It represents the same index that is referenced in the ZIM archive header
+                        path: 'X/listing/titleOrdered/v0',
+                        ptrName: 'titlePtrPos',
+                        countName: 'entryCount'
+                    },
+                    {
+                        // This defines a new version 1 index that is present in no-namespace ZIMs, and contains a title-ordered list of articles
+                        path: 'X/listing/titleOrdered/v1',
+                        ptrName: 'articlePtrPos',
+                        countName: 'articleCount'
+                    }
+                ]);
+                // DEV: Currently, extended listings are only used for title (=article) listings when the user searches
+                // for an article or uses the Random button, by which time the listings will have been extracted.
+                // If, in the future, listings are used in a more time-critical manner, consider forcing a wait before
+                // declaring the archive to be ready, by chaining the following callback in a .then() function of setListings.
                callbackReady(that);
            });
        };
@ -251,7 +273,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
        prefix = prefix || '';
        var that = this;
        var cns = this.getContentNamespace();
-        util.binarySearch(startIndex, this._file.articleCount, function(i) {
+        // Search v1 article listing if available, otherwise fallback to v0
+        var articleCount = this._file.articleCount || this._file.entryCount;
+        util.binarySearch(startIndex, articleCount, function(i) {
            return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {
                if (search.status === 'cancelled') return 0;
                var ns = dirEntry.namespace;
@ -263,7 +287,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
        }, true).then(function(firstIndex) {
            var dirEntries = [];
            var addDirEntries = function(index) {
-                if (search.status === 'cancelled' || index >= firstIndex + resultSize || index >= that._file.articleCount) {
+                if (search.status === 'cancelled' || index >= firstIndex + resultSize || index >= articleCount) {
                    return {
                        'dirEntries': dirEntries,
                        'nextStart': index
@ -333,18 +357,18 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
    };
    
    /**
-     * Searches a DirEntry (article / page) by its title.
-     * @param {String} title
-     * @return {Promise} resolving to the DirEntry object or null if not found.
+     * Searches the URL pointer list of Directory Entries by pathname
+     * @param {String} path The pathname of the DirEntry that is required (namespace + filename)
+     * @return {Promise<DirEntry>} A Promise that resolves to a Directory Entry, or null if not found.
     */
-    ZIMArchive.prototype.getDirEntryByTitle = function(title) {
+    ZIMArchive.prototype.getDirEntryByPath = function(path) {
        var that = this;
-        return util.binarySearch(0, this._file.articleCount, function(i) {
+        return util.binarySearch(0, this._file.entryCount, function(i) {
            return that._file.dirEntryByUrlIndex(i).then(function(dirEntry) {
                var url = dirEntry.namespace + "/" + dirEntry.url;
-                if (title < url)
+                if (path < url)
                    return -1;
-                else if (title > url)
+                else if (path > url)
                    return 1;
                else
                    return 0;
@ -353,10 +377,10 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
            if (index === null) return null;
            return that._file.dirEntryByUrlIndex(index);
        }).then(function(dirEntry) {
-            if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(title)) {
-                console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error...");
-                title = title.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
-                return that.getDirEntryByTitle(title);
+            if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(path)) {
+                console.log("Article " + path + " not available, but moving up one directory to compensate for ZIM coding error...");
+                path = path.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
+                return that.getDirEntryByPath(path);
            } else {
                return dirEntry;
            }
@ -368,8 +392,10 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
     * @param {callbackDirEntry} callback
     */
    ZIMArchive.prototype.getRandomDirEntry = function(callback) {
-        var index = Math.floor(Math.random() * this._file.articleCount);
-        this._file.dirEntryByUrlIndex(index).then(callback);
+        // Prefer an article-only (v1) title pointer list, if available
+        var articleCount = this._file.articleCount || this._file.entryCount;
+        var index = Math.floor(Math.random() * articleCount);
+        this._file.dirEntryByTitleIndex(index).then(callback);
    };

    /**
@ -379,7 +405,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
     */
    ZIMArchive.prototype.getMetadata = function (key, callback) {
        var that = this;
-        this.getDirEntryByTitle("M/" + key).then(function (dirEntry) {
+        this.getDirEntryByPath("M/" + key).then(function (dirEntry) {
            if (dirEntry === null || dirEntry === undefined) {
                console.warn("Title M/" + key + " not found in the archive");
                callback();
--- a/www/js/lib/zimfile.js
+++ b/www/js/lib/zimfile.js
@ -53,14 +53,17 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
     * @property {Array<File>} _files Array of ZIM files
     * @property {String} name Abstract archive name for file set
     * @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive
-     * @property {Integer} articleCount Total number of articles
+     * @property {Integer} entryCount Total number of entries in the URL pointerlist
+     * @property {Integer} articleCount Total number of articles in the v1 article-only pointerlist (async calculated entry)
     * @property {Integer} clusterCount Total number of clusters
     * @property {Integer} urlPtrPos Position of the directory pointerlist ordered by URL
-     * @property {Integer} titlePtrPos Position of the directory pointerlist ordered by title
+     * @property {Integer} titlePtrPos Position of the legacy v0 pointerlist ordered by title
+     * @property {Integer} articlePtrPos Position of the v1 article-only pointerlist ordered by title (async calculated entry)
     * @property {Integer} clusterPtrPos Position of the cluster pointer list
     * @property {Integer} mimeListPos Position of the MIME type list (also header size)
     * @property {Integer} mainPage Main page or 0xffffffff if no main page
     * @property {Integer} layoutPage Layout page or 0xffffffffff if no layout page
+     * @property {Map} mimeTypes The ZIM file's MIME type table rendered as a Map (calculated entry)
     */
    
    /**
@ -119,7 +122,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
            return readRequests[0];
        } else {
            // Wait until all are resolved and concatenate.
-            return Q.all(readRequests).then(function(arrays) {
+            return Q.all(readRequests).then(function (arrays) {
                var concatenated = new Uint8Array(end - begin);
                var offset = 0;
                arrays.forEach(function (item) {
@ -181,7 +184,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
     */
    ZIMFile.prototype.dirEntryByTitleIndex = function (index) {
        var that = this;
-        return this._readInteger(this.titlePtrPos + index * 4, 4).then(function (urlIndex) {
+        // Use v1 title pointerlist if available, or fall back to legacy v0 list
+        var ptrList = this.articlePtrPos || this.titlePtrPos;
+        return this._readInteger(ptrList + index * 4, 4).then(function (urlIndex) {
            return that.dirEntryByUrlIndex(urlIndex);
        });
    };
@ -190,28 +195,44 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
     * Read and if necessary decompress a BLOB based on its cluster number and blob number
     * @param {Integer} cluster The cluster number where the blob is to be found
     * @param {Integer} blob The blob number within the cluster
+     * @param {Boolean} meta If true, the function returns only the blob's metadata (its archive offset and its size)
+     *        when the cluster is uncompressed, or null when the cluster is compressed
     * @returns {Promise<Uint8Array>} A Promise for the BLOB's data
     */
-    ZIMFile.prototype.blob = function (cluster, blob) {
+    ZIMFile.prototype.blob = function (cluster, blob, meta) {
        var that = this;
        return this._readSlice(this.clusterPtrPos + cluster * 8, 16).then(function (clusterOffsets) {
            var clusterOffset = readInt(clusterOffsets, 0, 8);
            var nextCluster = readInt(clusterOffsets, 8, 8);
            // DEV: The method below of calculating cluster size is not safe: see https://github.com/openzim/libzim/issues/84#issuecomment-612962250
            // var thisClusterLength = nextCluster - clusterOffset - 1;
-            return that._readSlice(clusterOffset, 1).then(function(compressionType) {
+            return that._readSlice(clusterOffset, 1).then(function (compressionType) {
                var decompressor;
-                var plainBlobReader = function(offset, size) {
+                var plainBlobReader = function (offset, size, dataPass) {
                    // Check that we are not reading beyond the end of the cluster
                    var offsetStart = clusterOffset + 1 + offset;
-                    if ( offsetStart < nextCluster) {
+                    if (offsetStart < nextCluster) {
                        // Gratuitous parentheses added for legibility
                        size = (offsetStart + size) <= nextCluster ? size : (nextCluster - offsetStart);
-                        return that._readSlice(offsetStart, size);
+                        // DEV: This blob reader is called twice: on the first pass it reads the cluster's blob list,
+                        // and on the second pass ("dataPass") it is ready to read the blob's data
+                        if (meta && dataPass) {
+                            // If only metadata were requested and we are on the data pass, we should now have them
+                            return {
+                                ptr: offsetStart,
+                                size: size
+                            };
+                        } else {
+                            return that._readSlice(offsetStart, size);
+                        }
                    } else {
                        return Q(new Uint8Array(0).buffer);
                    }
                };
+                // If only metadata were requested and the cluster is compressed, return null (this is probably a ZIM format error)
+                // DEV: This is because metadata are only requested for finding absolute offsets into uncompressed clusters,
+                // principally for finding the start and size of a title pointer listing
+                if (meta && compressionType[0] > 1) return null;
                if (compressionType[0] === 0 || compressionType[0] === 1) {
                    // uncompressed
                    decompressor = { readSliceSingleThread: plainBlobReader };
@ -222,21 +243,91 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                } else {
                    return new Uint8Array(); // unsupported compression type
                }
-                return decompressor.readSliceSingleThread(blob * 4, 8).then(function(data) {
+                return decompressor.readSliceSingleThread(blob * 4, 8, false).then(function (data) {
                    var blobOffset = readInt(data, 0, 4);
                    var nextBlobOffset = readInt(data, 4, 4);
-                    return decompressor.readSliceSingleThread(blobOffset, nextBlobOffset - blobOffset);
+                    return decompressor.readSliceSingleThread(blobOffset, nextBlobOffset - blobOffset, true);
                });
            });
        });
    };

+    /**
+     * A Directory Listing object
+     * @typedef {Object} DirListing A list of pointers to directory entries (via the URL pointerlist)
+     * @property {String} path The path (url) to the directory entry for the Listing
+     * @property {String} ptrName The name of the pointer to the Listing's data that will be added to the ZIMFile object
+     * @property {String} countName The name of the key that will contain the number of entries in the Listing, to be added to the ZIMFile object 
+     */
+
+    /**
+     * Read the metadata (archive offset pointer, and number of entries) of one or more ZIM directory Listings.
+     * This supports reading a subset of user content that might be ordered differently from the main URL pointerlist.
+     * In particular, it supports v1 title pointerlists, which contain articles sorted by title, superseding the article
+     * namespace ('A') in legacy ZIM archives.  
+     * @param {Array<DirListing>} listings An array of DirListing objects (see zimArchive.js for examples)  
+     */
+    ZIMFile.prototype.setListings = function(listings) {
+        // If we are in a legacy ZIM archive, there is nothing further to look up
+        if (this.minorVersion === 0) {
+            console.debug('ZIM DirListing version: 0 (legacy)', this);
+            return;
+        }
+        var that = this;
+        var highestListingVersion = 0;
+        var listingAccessor = function (listing) {
+            if (!listing) {
+                // No more listings, so exit
+                console.debug('ZIM DirListing version: ' + highestListingVersion, that);
+                return null;
+            }
+            // Check if we already have this listing's values, so we don't do redundant binary searches
+            if (that[listing.ptrName] && that[listing.countName]) {
+                highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion);
+                // Get the next listing
+                return listingAccessor(listings.pop());
+            }
+            // Initiate a binary search for the listing URL
+            return util.binarySearch(0, that.entryCount, function(i) {
+                return that.dirEntryByUrlIndex(i).then(function(dirEntry) {
+                    var url = dirEntry.namespace + "/" + dirEntry.url;
+                    if (listing.path < url)
+                        return -1;
+                    else if (listing.path > url)
+                        return 1;
+                    else
+                        return 0;
+                });
+            }).then(function(index) {
+                if (index === null) return null;
+                return that.dirEntryByUrlIndex(index);
+            }).then(function(dirEntry) {
+                if (!dirEntry) return null;
+                // Request the metadata for the blob represented by the dirEntry
+                return that.blob(dirEntry.cluster, dirEntry.blob, true);
+            }).then(function(metadata) {
+                // Note that we do not accept a listing if its size is 0, i.e. if it contains no data
+                // (although this should not occur, we have been asked to handle it - see kiwix-js #708)
+                if (metadata && metadata.size) {
+                    that[listing.ptrName] = metadata.ptr;
+                    that[listing.countName] = metadata.size / 4; // Each entry uses 4 bytes
+                    highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion);
+                }
+                // Get the next Listing
+                return listingAccessor(listings.pop());
+            }).catch(function(err) {
+                console.error('There was an error accessing a Directory Listing', err);
+            });
+        };
+        listingAccessor(listings.pop());
+    };    
+
    /**
     * Reads the whole MIME type list and returns it as a populated Map
     * The mimeTypeMap is extracted once after the user has picked the ZIM file
     * and is stored as ZIMFile.mimeTypes
     * @param {File} file The ZIM file (or first file in array of files) from which the MIME type list 
-*                      is to be extracted
+     *      is to be extracted
     * @param {Integer} mimeListPos The offset in <file> at which the MIME type list is found
     * @param {Integer} urlPtrPos The offset of URL Pointer List in the archive
     * @returns {Promise} A promise for the MIME Type list as a Map
@ -254,7 +345,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                var pos = -1;
                var mimeString;
                while (pos < size) {
-                    pos++; 
+                    pos++;
                    mimeString = utf8.parse(data.subarray(pos), true);
                    // If the parsed data is an empty string, we have reached the end of the MIME type list, so break 
                    if (!mimeString) break;
@ -267,30 +358,30 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                }
            }
            return typeMap;
-        }).catch(function(err) {
+        }).catch(function (err) {
            console.error('Unable to read MIME type list', err);
            return new Map;
        });
    }
-    
+
    return {
        /**
         * @param {Array<File>} fileArray An array of picked archive files
         * @returns {Promise<Object>} A Promise for the ZimFile Object
         */
-        fromFileArray: function(fileArray) {
+        fromFileArray: function (fileArray) {
            // Array of blob objects should be sorted by their name property
-            fileArray.sort(function(a, b) {
-                  var nameA = a.name.toUpperCase(); 
-                  var nameB = b.name.toUpperCase(); 
+            fileArray.sort(function (a, b) {
+                var nameA = a.name.toUpperCase();
+                var nameB = b.name.toUpperCase();
                if (nameA < nameB) return -1;
                if (nameA > nameB) return 1;
-                  return 0;
+                return 0;
            });
-            return util.readFileSlice(fileArray[0], 0, 80).then(function(header) {
+            return util.readFileSlice(fileArray[0], 0, 80).then(function (header) {
                var mimeListPos = readInt(header, 56, 8);
                var urlPtrPos = readInt(header, 32, 8);
-                return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) {
+                return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (mapData) {
                    var zf = new ZIMFile(fileArray);
                    // Add an abstract archive name (ignoring split file extensions)
                    zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1');
@ -303,15 +394,17 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                    // For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
                    zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
                    zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
-                    zf.articleCount = readInt(header, 24, 4);
+                    zf.entryCount = readInt(header, 24, 4);
+                    zf.articleCount = null; // Calculated async by setListings() called from zimArchive.js 
                    zf.clusterCount = readInt(header, 28, 4);
                    zf.urlPtrPos = urlPtrPos;
                    zf.titlePtrPos = readInt(header, 40, 8);
+                    zf.articlePtrPos = null; // Calculated async by setListings() called from zimArchive.js 
                    zf.clusterPtrPos = readInt(header, 48, 8);
                    zf.mimeListPos = mimeListPos;
                    zf.mainPage = readInt(header, 64, 4);
                    zf.layoutPage = readInt(header, 68, 4);
-                    zf.mimeTypes = data;
+                    zf.mimeTypes = mapData;
                    return zf;
                });
            });