Support v1 directory listings in no-namespace ZIM archives

Former-commit-id: 7a845c89af339e06b60f135720d5fa7ff89e1566 [formerly 3747fcc9bc66a988158e75821d82200b46341cfc] [formerly a33b5b9586e253d1fa52d3021e910fb8060c5ad9] [formerly e782bee207ec07a5a974ff559974bca494ac84c0 [formerly 4d35f36d9c9e499c17a4ccaa9cb4e7ff05f04893 [formerly 2606c489dae01e64640896bcad93d6511f77dbf6]]]
Former-commit-id: 2046bc5adc230637e777d20766f5b541d23f7b09 [formerly ecee0a4297ad0dfcdcebc990ec001007d27cafdb [formerly 117594ac0431b953d4c528cac41bc60d0bf5fffa]]
Former-commit-id: 7bca7338d773a9b28beb5bbf0c6dc8ce15591331 [formerly e1285c39db7e43377d3f75ae7c5cb00d4daa1bb2]
Former-commit-id: c74944f151c57050c269191e34e2f83234746560
This commit is contained in:
Jaifroid 2021-04-03 13:18:24 +01:00
parent 2768534578
commit af25d663b1
4 changed files with 164 additions and 45 deletions

View File

@ -444,7 +444,7 @@ define(['q', 'settingsStore', 'uiUtil'], function(Q, settingsStore, uiUtil) {
selectedArchive.readUtf8File : selectedArchive.readBinaryFile;
// Bypass getting dirEntry if we already have it
var getDirEntry = dirEntry ? Q.Promise.resolve() :
selectedArchive.getDirEntryByTitle(title);
selectedArchive.getDirEntryByPath(title);
// Read data from ZIM
getDirEntry.then(function(resolvedDirEntry) {
if (dirEntry) resolvedDirEntry = dirEntry;

View File

@ -84,7 +84,7 @@ define(['uiUtil'], function (uiUtil) {
});
return;
}
appstate.selectedArchive.getDirEntryByTitle(title).then(function (dirEntry) {
appstate.selectedArchive.getDirEntryByPath(title).then(function (dirEntry) {
return appstate.selectedArchive.readBinaryFile(dirEntry, function (fileDirEntry, content) {
image.style.background = '';
var mimetype = dirEntry.getMimetype();

View File

@ -46,9 +46,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
* Creates a ZIM archive object to access the ZIM file at the given path in the given storage.
* This constructor can also be used with a single File parameter.
*
* @param {StorageFirefoxOS|Array.<Blob>} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted)
* @param {String} path
* @param {callbackZIMArchive} callbackReady
* @param {StorageFirefoxOS|Array<Blob>} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted)
* @param {String} path The Storage path for an OS that requires this to be specified
* @param {callbackZIMArchive} callbackReady The function to call when the archive is ready to use
*/
function ZIMArchive(storage, path, callbackReady) {
var that = this;
@ -57,6 +57,28 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
var createZimfile = function(fileArray) {
zimfile.fromFileArray(fileArray).then(function(file) {
that._file = file;
// File has been created, but we need to add any Listings which extend the archive metadata
that._file.setListings([
// Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file
// 'ptrName' and 'countName' contain the key names to be set in the archive file object
{
// This defines the standard v0 (legacy) title index that contains listings for every entry in the ZIM (not just articles)
// It represents the same index that is referenced in the ZIM archive header
path: 'X/listing/titleOrdered/v0',
ptrName: 'titlePtrPos',
countName: 'entryCount'
},
{
// This defines a new version 1 index that is present in no-namespace ZIMs, and contains a title-ordered list of articles
path: 'X/listing/titleOrdered/v1',
ptrName: 'articlePtrPos',
countName: 'articleCount'
}
]);
// DEV: Currently, extended listings are only used for title (=article) listings when the user searches
// for an article or uses the Random button, by which time the listings will have been extracted.
// If, in the future, listings are used in a more time-critical manner, consider forcing a wait before
// declaring the archive to be ready, by chaining the following callback in a .then() function of setListings.
callbackReady(that);
});
};
@ -251,7 +273,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
prefix = prefix || '';
var that = this;
var cns = this.getContentNamespace();
util.binarySearch(startIndex, this._file.articleCount, function(i) {
// Search v1 article listing if available, otherwise fallback to v0
var articleCount = this._file.articleCount || this._file.entryCount;
util.binarySearch(startIndex, articleCount, function(i) {
return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {
if (search.status === 'cancelled') return 0;
var ns = dirEntry.namespace;
@ -263,7 +287,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
}, true).then(function(firstIndex) {
var dirEntries = [];
var addDirEntries = function(index) {
if (search.status === 'cancelled' || index >= firstIndex + resultSize || index >= that._file.articleCount) {
if (search.status === 'cancelled' || index >= firstIndex + resultSize || index >= articleCount) {
return {
'dirEntries': dirEntries,
'nextStart': index
@ -333,18 +357,18 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
};
/**
* Searches a DirEntry (article / page) by its title.
* @param {String} title
* @return {Promise} resolving to the DirEntry object or null if not found.
* Searches the URL pointer list of Directory Entries by pathname
* @param {String} path The pathname of the DirEntry that is required (namespace + filename)
* @return {Promise<DirEntry>} A Promise that resolves to a Directory Entry, or null if not found.
*/
ZIMArchive.prototype.getDirEntryByTitle = function(title) {
ZIMArchive.prototype.getDirEntryByPath = function(path) {
var that = this;
return util.binarySearch(0, this._file.articleCount, function(i) {
return util.binarySearch(0, this._file.entryCount, function(i) {
return that._file.dirEntryByUrlIndex(i).then(function(dirEntry) {
var url = dirEntry.namespace + "/" + dirEntry.url;
if (title < url)
if (path < url)
return -1;
else if (title > url)
else if (path > url)
return 1;
else
return 0;
@ -353,10 +377,10 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
if (index === null) return null;
return that._file.dirEntryByUrlIndex(index);
}).then(function(dirEntry) {
if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(title)) {
console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error...");
title = title.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
return that.getDirEntryByTitle(title);
if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(path)) {
console.log("Article " + path + " not available, but moving up one directory to compensate for ZIM coding error...");
path = path.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
return that.getDirEntryByPath(path);
} else {
return dirEntry;
}
@ -368,8 +392,10 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
* @param {callbackDirEntry} callback
*/
ZIMArchive.prototype.getRandomDirEntry = function(callback) {
var index = Math.floor(Math.random() * this._file.articleCount);
this._file.dirEntryByUrlIndex(index).then(callback);
// Prefer an article-only (v1) title pointer list, if available
var articleCount = this._file.articleCount || this._file.entryCount;
var index = Math.floor(Math.random() * articleCount);
this._file.dirEntryByTitleIndex(index).then(callback);
};
/**
@ -379,7 +405,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
*/
ZIMArchive.prototype.getMetadata = function (key, callback) {
var that = this;
this.getDirEntryByTitle("M/" + key).then(function (dirEntry) {
this.getDirEntryByPath("M/" + key).then(function (dirEntry) {
if (dirEntry === null || dirEntry === undefined) {
console.warn("Title M/" + key + " not found in the archive");
callback();

View File

@ -53,14 +53,17 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
* @property {Array<File>} _files Array of ZIM files
* @property {String} name Abstract archive name for file set
* @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive
* @property {Integer} articleCount Total number of articles
* @property {Integer} entryCount Total number of entries in the URL pointerlist
* @property {Integer} articleCount Total number of articles in the v1 article-only pointerlist (async calculated entry)
* @property {Integer} clusterCount Total number of clusters
* @property {Integer} urlPtrPos Position of the directory pointerlist ordered by URL
* @property {Integer} titlePtrPos Position of the directory pointerlist ordered by title
* @property {Integer} titlePtrPos Position of the legacy v0 pointerlist ordered by title
* @property {Integer} articlePtrPos Position of the v1 article-only pointerlist ordered by title (async calculated entry)
* @property {Integer} clusterPtrPos Position of the cluster pointer list
* @property {Integer} mimeListPos Position of the MIME type list (also header size)
* @property {Integer} mainPage Main page or 0xffffffff if no main page
* @property {Integer} layoutPage Layout page or 0xffffffff if no layout page
* @property {Map} mimeTypes The ZIM file's MIME type table rendered as a Map (calculated entry)
*/
/**
@ -119,7 +122,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
return readRequests[0];
} else {
// Wait until all are resolved and concatenate.
return Q.all(readRequests).then(function(arrays) {
return Q.all(readRequests).then(function (arrays) {
var concatenated = new Uint8Array(end - begin);
var offset = 0;
arrays.forEach(function (item) {
@ -181,7 +184,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
*/
ZIMFile.prototype.dirEntryByTitleIndex = function (index) {
var that = this;
return this._readInteger(this.titlePtrPos + index * 4, 4).then(function (urlIndex) {
// Use v1 title pointerlist if available, or fall back to legacy v0 list
var ptrList = this.articlePtrPos || this.titlePtrPos;
return this._readInteger(ptrList + index * 4, 4).then(function (urlIndex) {
return that.dirEntryByUrlIndex(urlIndex);
});
};
@ -190,28 +195,44 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
* Read and if necessary decompress a BLOB based on its cluster number and blob number
* @param {Integer} cluster The cluster number where the blob is to be found
* @param {Integer} blob The blob number within the cluster
* @param {Boolean} meta If true, and if the cluster is uncompressed, the function will return only the blob's metadata
* (its archive offset and its size), otherwise return null
* @returns {Promise<Uint8Array>} A Promise for the BLOB's data
*/
ZIMFile.prototype.blob = function (cluster, blob) {
ZIMFile.prototype.blob = function (cluster, blob, meta) {
var that = this;
return this._readSlice(this.clusterPtrPos + cluster * 8, 16).then(function (clusterOffsets) {
var clusterOffset = readInt(clusterOffsets, 0, 8);
var nextCluster = readInt(clusterOffsets, 8, 8);
// DEV: The method below of calculating cluster size is not safe: see https://github.com/openzim/libzim/issues/84#issuecomment-612962250
// var thisClusterLength = nextCluster - clusterOffset - 1;
return that._readSlice(clusterOffset, 1).then(function(compressionType) {
return that._readSlice(clusterOffset, 1).then(function (compressionType) {
var decompressor;
var plainBlobReader = function(offset, size) {
var plainBlobReader = function (offset, size, dataPass) {
// Check that we are not reading beyond the end of the cluster
var offsetStart = clusterOffset + 1 + offset;
if ( offsetStart < nextCluster) {
if (offsetStart < nextCluster) {
// Gratuitous parentheses added for legibility
size = (offsetStart + size) <= nextCluster ? size : (nextCluster - offsetStart);
return that._readSlice(offsetStart, size);
// DEV: This blob reader is called twice: on the first pass it reads the cluster's blob list,
// and on the second pass ("dataPass") it is ready to read the blob's data
if (meta && dataPass) {
// If only metadata were requested and we are on the data pass, we should now have them
return {
ptr: offsetStart,
size: size
};
} else {
return that._readSlice(offsetStart, size);
}
} else {
return Q(new Uint8Array(0).buffer);
}
};
// If only metadata were requested and the cluster is compressed, return null (this is probably a ZIM format error)
// DEV: This is because metadata are only requested for finding absolute offsets into uncompressed clusters,
// principally for finding the start and size of a title pointer listing
if (meta && compressionType[0] > 1) return null;
if (compressionType[0] === 0 || compressionType[0] === 1) {
// uncompressed
decompressor = { readSliceSingleThread: plainBlobReader };
@ -222,21 +243,91 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
} else {
return new Uint8Array(); // unsupported compression type
}
return decompressor.readSliceSingleThread(blob * 4, 8).then(function(data) {
return decompressor.readSliceSingleThread(blob * 4, 8, false).then(function (data) {
var blobOffset = readInt(data, 0, 4);
var nextBlobOffset = readInt(data, 4, 4);
return decompressor.readSliceSingleThread(blobOffset, nextBlobOffset - blobOffset);
return decompressor.readSliceSingleThread(blobOffset, nextBlobOffset - blobOffset, true);
});
});
});
};
/**
* A Directory Listing object
* @typedef {Object} DirListing A list of pointers to directory entries (via the URL pointerlist)
* @property {String} path The path (url) to the directory entry for the Listing
* @property {String} ptrName The name of the pointer to the Listing's data that will be added to the ZIMFile object
* @property {String} countName The name of the key that will contain the number of entries in the Listing, to be added to the ZIMFile object
*/
/**
 * Read the metadata (archive offset pointer, and number of entries) of one or more ZIM directory Listings.
 * This supports reading a subset of user content that might be ordered differently from the main URL pointerlist.
 * In particular, it supports v1 title pointerlists, which contain articles sorted by title, superseding the article
 * namespace ('A') in legacy ZIM archives.
 * For every Listing that is found with a non-zero size, this[listing.ptrName] is set to the archive offset of the
 * Listing's data and this[listing.countName] to the number of 4-byte entries it holds.
 * Listings are examined one at a time, from the last element of the array to the first.
 * @param {Array<DirListing>} listings An array of DirListing objects (see zimArchive.js for examples);
 *     the array itself is left unmodified
 * @returns {Promise} A Promise that settles when the Listings have been examined. An error raised during a lookup
 *     is logged and ends the scan instead of being propagated to the caller
 */
ZIMFile.prototype.setListings = function (listings) {
    // If we are in a legacy ZIM archive, there is nothing further to look up
    if (this.minorVersion === 0) {
        console.debug('ZIM DirListing version: 0 (legacy)', this);
        return Q(null);
    }
    var that = this;
    // Consume a private copy, so that the array owned by the caller is not emptied as a side effect
    var pending = (listings || []).slice();
    var highestListingVersion = 0;
    // The version of a Listing is the final digit of its path (e.g. 'X/listing/titleOrdered/v1' -> 1);
    // a path that does not end in a digit counts as version 0
    var recordVersion = function (listing) {
        highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion);
    };
    var listingAccessor = function (listing) {
        if (!listing) {
            // No more listings, so exit
            console.debug('ZIM DirListing version: ' + highestListingVersion, that);
            return null;
        }
        // Check if we already have this listing's values, so we don't do redundant binary searches
        if (that[listing.ptrName] && that[listing.countName]) {
            recordVersion(listing);
            // Get the next listing
            return listingAccessor(pending.pop());
        }
        // Initiate a binary search for the listing URL
        return util.binarySearch(0, that.entryCount, function (i) {
            return that.dirEntryByUrlIndex(i).then(function (dirEntry) {
                var url = dirEntry.namespace + "/" + dirEntry.url;
                if (listing.path < url)
                    return -1;
                else if (listing.path > url)
                    return 1;
                else
                    return 0;
            });
        }).then(function (index) {
            if (index === null) return null;
            return that.dirEntryByUrlIndex(index);
        }).then(function (dirEntry) {
            if (!dirEntry) return null;
            // Request the metadata for the blob represented by the dirEntry
            return that.blob(dirEntry.cluster, dirEntry.blob, true);
        }).then(function (metadata) {
            // Note that we do not accept a listing if its size is 0, i.e. if it contains no data
            // (although this should not occur, we have been asked to handle it - see kiwix-js #708)
            if (metadata && metadata.size) {
                that[listing.ptrName] = metadata.ptr;
                that[listing.countName] = metadata.size / 4; // Each entry uses 4 bytes
                recordVersion(listing);
            }
            // Get the next Listing
            return listingAccessor(pending.pop());
        }).catch(function (err) {
            console.error('There was an error accessing a Directory Listing', err);
        });
    };
    // Hand the chain back to the caller so that it can wait for the listings if it needs to
    return Q(listingAccessor(pending.pop()));
};
/**
* Reads the whole MIME type list and returns it as a populated Map
* The mimeTypeMap is extracted once after the user has picked the ZIM file
* and is stored as ZIMFile.mimeTypes
* @param {File} file The ZIM file (or first file in array of files) from which the MIME type list
* is to be extracted
* is to be extracted
* @param {Integer} mimeListPos The offset in <file> at which the MIME type list is found
* @param {Integer} urlPtrPos The offset of URL Pointer List in the archive
* @returns {Promise} A promise for the MIME Type list as a Map
@ -254,7 +345,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
var pos = -1;
var mimeString;
while (pos < size) {
pos++;
pos++;
mimeString = utf8.parse(data.subarray(pos), true);
// If the parsed data is an empty string, we have reached the end of the MIME type list, so break
if (!mimeString) break;
@ -267,30 +358,30 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
}
}
return typeMap;
}).catch(function(err) {
}).catch(function (err) {
console.error('Unable to read MIME type list', err);
return new Map;
});
}
return {
/**
* @param {Array<File>} fileArray An array of picked archive files
* @returns {Promise<Object>} A Promise for the ZimFile Object
*/
fromFileArray: function(fileArray) {
fromFileArray: function (fileArray) {
// Array of blob objects should be sorted by their name property
fileArray.sort(function(a, b) {
var nameA = a.name.toUpperCase();
var nameB = b.name.toUpperCase();
fileArray.sort(function (a, b) {
var nameA = a.name.toUpperCase();
var nameB = b.name.toUpperCase();
if (nameA < nameB) return -1;
if (nameA > nameB) return 1;
return 0;
return 0;
});
return util.readFileSlice(fileArray[0], 0, 80).then(function(header) {
return util.readFileSlice(fileArray[0], 0, 80).then(function (header) {
var mimeListPos = readInt(header, 56, 8);
var urlPtrPos = readInt(header, 32, 8);
return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) {
return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (mapData) {
var zf = new ZIMFile(fileArray);
// Add an abstract archive name (ignoring split file extensions)
zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1');
@ -303,15 +394,17 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
// For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
zf.articleCount = readInt(header, 24, 4);
zf.entryCount = readInt(header, 24, 4);
zf.articleCount = null; // Calculated async by setListings() called from zimArchive.js
zf.clusterCount = readInt(header, 28, 4);
zf.urlPtrPos = urlPtrPos;
zf.titlePtrPos = readInt(header, 40, 8);
zf.articlePtrPos = null; // Calculated async by setListings() called from zimArchive.js
zf.clusterPtrPos = readInt(header, 48, 8);
zf.mimeListPos = mimeListPos;
zf.mainPage = readInt(header, 64, 4);
zf.layoutPage = readInt(header, 68, 4);
zf.mimeTypes = data;
zf.mimeTypes = mapData;
return zf;
});
});