mirror of
https://github.com/kiwix/kiwix-js-pwa.git
synced 2025-08-10 23:08:27 -04:00

Former-commit-id: c782a998f6cb60785b590d5b65ac77d22b03a126 [formerly 2b27b93e05fe72760e97eae83808c8517de1cd13 [formerly b081098996accb5e5d076d1cc4193142b06830e9]] Former-commit-id: a7e376811f1a71e85a64b746c2a58d608bec988b Former-commit-id: 1b62e0abf09d742a88311aeea1545c0b53c2adea
470 lines
23 KiB
JavaScript
470 lines
23 KiB
JavaScript
/**
|
|
* zimfile.js: Low-level ZIM file reader.
|
|
*
|
|
* Copyright 2015 Mossroy and contributors
|
|
* License GPL v3:
|
|
*
|
|
* This file is part of Kiwix.
|
|
*
|
|
* Kiwix is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Kiwix is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
|
|
*/
|
|
'use strict';
|
|
|
|
/**
|
|
* Add Polyfill currently required by IE11 to run zstddec-asm and xzdec-asm
|
|
* See https://github.com/emscripten-core/emscripten/issues/14700
|
|
* If this is resolved upstream, remove this polyfill
|
|
* Source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/startsWith
|
|
*/
|
|
if (!String.prototype.startsWith) {
|
|
Object.defineProperty(String.prototype, 'startsWith', {
|
|
value: function(search, rawPos) {
|
|
var pos = rawPos > 0 ? rawPos|0 : 0;
|
|
return this.substring(pos, pos + search.length) === search;
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
* A global variable to track the assembler machine type and the last used decompressor (for reporting to the API panel)
|
|
* This is populated in the Emscripten wrappers
|
|
* @type {Object}
|
|
* @property {String} assemblerMachineType The assembler machine type supported and/or loaded by this app: 'ASM' or 'WASM'
|
|
* @property {String} decompressorLastUsed The decompressor that was last used to decode a compressed cluster (currently 'XZ' or 'ZSTD')
|
|
* @property {String} errorStatus A description of any detected error in loading a decompressor
|
|
*/
|
|
params.decompressorAPI = {
|
|
assemblerMachineType: null,
|
|
decompressorLastUsed: null,
|
|
errorStatus: null
|
|
};
|
|
|
|
define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'zimDirEntry', 'filecache'], function(xz, zstd, util, utf8, zimDirEntry, FileCache) {
|
|
|
|
/**
|
|
* A variable to keep track of the currently loaded ZIM archive, e.g., for labelling cache entries
|
|
* The ID is temporary and is reset to 0 at each session start; it is incremented by 1 each time a new ZIM is loaded
|
|
* @type {Integer}
|
|
*/
|
|
var tempFileId = 0;
|
|
|
|
/**
|
|
* A Map to keep track of temporary File IDs
|
|
* @type {Map}
|
|
*/
|
|
var fileIDs = new Map();
|
|
|
|
var readInt = function (data, offset, size) {
|
|
var r = 0;
|
|
for (var i = 0; i < size; i++) {
|
|
var c = (data[offset + i] + 256) & 0xff;
|
|
r += util.leftShift(c, 8 * i);
|
|
}
|
|
return r;
|
|
};
|
|
|
|
/**
|
|
* A ZIM File
|
|
*
|
|
* See https://wiki.openzim.org/wiki/ZIM_file_format#Header
|
|
*
|
|
* @typedef {Object} ZIMFile
|
|
* @property {Array<File>} _files Array of ZIM files
|
|
* @property {String} name Abstract archive name for file set
|
|
* @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive
|
|
* @property {Integer} entryCount Total number of entries in the URL pointerlist
|
|
* @property {Integer} articleCount Total number of article titles in the v1 article-only pointerlist (async calculated entry)
|
|
* @property {Integer} clusterCount Total number of clusters
|
|
* @property {Integer} urlPtrPos Position of the directory pointerlist ordered by URL
|
|
* @property {Integer} titlePtrPos Position of the legacy v0 pointerlist ordered by title
|
|
* @property {Integer} articlePtrPos Position of the v1 article-only pointerlist ordered by title (async calculated entry)
|
|
* @property {Integer} clusterPtrPos Position of the cluster pointer list
|
|
* @property {Integer} mimeListPos Position of the MIME type list (also header size)
|
|
* @property {Integer} mainPage Main page or 0xffffffff if no main page
|
|
* @property {Integer} layoutPage Layout page or 0xffffffffff if no layout page
|
|
* @property {Map} mimeTypes The ZIM file's MIME type table rendered as a Map (calculated entry)
|
|
*/
|
|
|
|
/**
|
|
* Abstract an array of one or more (split) ZIM archives
|
|
* @param {Array<File>} abstractFileArray An array of ZIM file parts
|
|
*/
|
|
function ZIMFile(abstractFileArray) {
|
|
this._files = abstractFileArray;
|
|
}
|
|
|
|
/**
|
|
* Read and decode an integer value from the ZIM archive
|
|
* @param {Integer} offset The offset at which the integer is found
|
|
* @param {Integer} size The size of data to read
|
|
* @returns {Promise<Integer>} A Promise for the returned value
|
|
*/
|
|
ZIMFile.prototype._readInteger = function (offset, size) {
|
|
return this._readSlice(offset, size).then(function (data) {
|
|
return readInt(data, 0, size);
|
|
});
|
|
};
|
|
|
|
/**
|
|
* Read a slice from the FileCache or ZIM set, starting at offset for size of bytes
|
|
* @param {Integer} offset The absolute offset from the start of the ZIM file or file set at which to start reading
|
|
* @param {Integer} size The number of bytes to read
|
|
* @returns {Promise<Uint8Array>} A Promise for a Uint8Array containing the requested data
|
|
*/
|
|
ZIMFile.prototype._readSlice = function(offset, size) {
|
|
return FileCache.read(this, offset, offset + size);
|
|
};
|
|
|
|
/**
|
|
* Read a slice from a set of one or more ZIM files constituting a single archive, and concatenate the data parts
|
|
* @param {Integer} begin The absolute byte offset from which to start reading
|
|
* @param {Integer} end The absolute byte offset where reading should stop (the end byte is not read)
|
|
* @returns {Promise<Uint8Array>} A Promise for a Uint8Array containing the concatenated data
|
|
*/
|
|
ZIMFile.prototype._readSplitSlice = function (begin, end) {
|
|
var file = this;
|
|
var readRequests = [];
|
|
var currentOffset = 0;
|
|
for (var i = 0; i < file._files.length; currentOffset += file._files[i].size, ++i) {
|
|
var currentSize = file._files[i].size;
|
|
if (begin < currentOffset + currentSize && currentOffset < end) {
|
|
// DEV: Math.max is used below because we could be reading the last part of a blob split across two files,
|
|
// in which case (begin - currentOffset) could be negative!
|
|
var readStart = Math.max(0, begin - currentOffset);
|
|
var readEnd = Math.min(currentSize, end - currentOffset);
|
|
readRequests.push(util.readFileSlice(file._files[i], readStart, readEnd));
|
|
}
|
|
}
|
|
if (readRequests.length === 0) {
|
|
return Promise.resolve(new Uint8Array(0).buffer);
|
|
} else if (readRequests.length === 1) {
|
|
return readRequests[0];
|
|
} else {
|
|
// Wait until all are resolved and concatenate.
|
|
return Promise.all(readRequests).then(function (arrays) {
|
|
var concatenated = new Uint8Array(end - begin);
|
|
var offset = 0;
|
|
arrays.forEach(function (item) {
|
|
concatenated.set(new Uint8Array(item), offset);
|
|
offset += item.byteLength;
|
|
});
|
|
return concatenated;
|
|
});
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Read and parse a Directory Entry at the given archive offset
|
|
* @param {Integer} offset The offset at which the DirEntry is located
|
|
* @returns {Promise<DirEntry>} A Promise for the requested DirEntry
|
|
*/
|
|
ZIMFile.prototype.dirEntry = function (offset) {
|
|
var that = this;
|
|
return this._readSlice(offset, 2048).then(function (data) {
|
|
var dirEntry = {
|
|
offset: offset,
|
|
mimetypeInteger: readInt(data, 0, 2),
|
|
namespace: String.fromCharCode(data[3])
|
|
};
|
|
dirEntry.redirect = (dirEntry.mimetypeInteger === 0xffff);
|
|
if (dirEntry.redirect) {
|
|
dirEntry.redirectTarget = readInt(data, 8, 4);
|
|
} else {
|
|
dirEntry.cluster = readInt(data, 8, 4);
|
|
dirEntry.blob = readInt(data, 12, 4);
|
|
}
|
|
var pos = dirEntry.redirect ? 12 : 16;
|
|
if (data.subarray) {
|
|
dirEntry.url = utf8.parse(data.subarray(pos), true);
|
|
while (data[pos] !== 0)
|
|
pos++;
|
|
dirEntry.title = utf8.parse(data.subarray(pos + 1), true);
|
|
return new zimDirEntry.DirEntry(that, dirEntry);
|
|
}
|
|
});
|
|
};
|
|
|
|
/**
|
|
* Find a Directory Entry based on its URL Pointer index
|
|
* @param {Integer} index The URL Pointer index to the DirEntry
|
|
* @returns {Promise<DirEntry>} A Promise for the requested DirEntry
|
|
*/
|
|
ZIMFile.prototype.dirEntryByUrlIndex = function (index) {
|
|
var that = this;
|
|
return this._readInteger(this.urlPtrPos + index * 8, 8).then(function (dirEntryPos) {
|
|
return that.dirEntry(dirEntryPos);
|
|
});
|
|
};
|
|
|
|
/**
|
|
* Find a Directory Entry based on its Title Pointer index
|
|
* @param {Integer} index The Title Pointer index to the DirEntry
|
|
* @returns {Promise<DirEntry>} A Promise for the requested DirEntry
|
|
*/
|
|
ZIMFile.prototype.dirEntryByTitleIndex = function (index) {
|
|
var that = this;
|
|
// Use v1 title pointerlist if available, or fall back to legacy v0 list
|
|
var ptrList = this.articlePtrPos || this.titlePtrPos;
|
|
return this._readInteger(ptrList + index * 4, 4).then(function (urlIndex) {
|
|
return that.dirEntryByUrlIndex(urlIndex);
|
|
});
|
|
};
|
|
|
|
/**
|
|
* Read and if necessary decompress a BLOB based on its cluster number and blob number
|
|
* @param {Integer} cluster The cluster number where the blob is to be found
|
|
* @param {Integer} blob The blob number within the cluster
|
|
* @param {Boolean} meta If true, and if the cluster is uncompressed, the function will return only the blob's metadata
|
|
* (its archive offset and its size), otherwise return null
|
|
* @returns {Promise<Uint8Array>} A Promise for the BLOB's data
|
|
*/
|
|
ZIMFile.prototype.blob = function (cluster, blob, meta) {
|
|
var that = this;
|
|
return this._readSlice(this.clusterPtrPos + cluster * 8, 16).then(function (clusterOffsets) {
|
|
var clusterOffset = readInt(clusterOffsets, 0, 8);
|
|
var nextCluster = readInt(clusterOffsets, 8, 8);
|
|
// DEV: The method below of calculating cluster size is not safe: see https://github.com/openzim/libzim/issues/84#issuecomment-612962250
|
|
// var thisClusterLength = nextCluster - clusterOffset - 1;
|
|
return that._readSlice(clusterOffset, 1).then(function (compressionType) {
|
|
var decompressor;
|
|
var plainBlobReader = function (offset, size, dataPass) {
|
|
// Check that we are not reading beyond the end of the cluster
|
|
var offsetStart = clusterOffset + 1 + offset;
|
|
if (offsetStart < nextCluster) {
|
|
// Gratuitous parentheses added for legibility
|
|
size = (offsetStart + size) <= nextCluster ? size : (nextCluster - offsetStart);
|
|
// DEV: This blob reader is called twice: on the first pass it reads the cluster's blob list,
|
|
// and on the second pass ("dataPass") it is ready to read the blob's data
|
|
if (meta && dataPass) {
|
|
// If only metadata were requested and we are on the data pass, we should now have them
|
|
return {
|
|
ptr: offsetStart,
|
|
size: size
|
|
};
|
|
} else {
|
|
return that._readSlice(offsetStart, size);
|
|
}
|
|
} else {
|
|
return Promise.resolve(new Uint8Array(0).buffer);
|
|
}
|
|
};
|
|
// If only metadata were requested and the cluster is compressed, return null (this is probably a ZIM format error)
|
|
// DEV: This is because metadata are only requested for finding absolute offsets into uncompressed clusters,
|
|
// principally for finding the start and size of a title pointer listing
|
|
if (meta && compressionType[0] > 1) return null;
|
|
if (compressionType[0] === 0 || compressionType[0] === 1) {
|
|
// uncompressed
|
|
decompressor = { readSliceSingleThread: plainBlobReader };
|
|
} else if (compressionType[0] === 4) {
|
|
decompressor = new xz.Decompressor(plainBlobReader);
|
|
} else if (compressionType[0] === 5) {
|
|
decompressor = new zstd.Decompressor(plainBlobReader);
|
|
} else {
|
|
return new Uint8Array(); // unsupported compression type
|
|
}
|
|
return decompressor.readSliceSingleThread(blob * 4, 8, false).then(function (data) {
|
|
var blobOffset = readInt(data, 0, 4);
|
|
var nextBlobOffset = readInt(data, 4, 4);
|
|
return decompressor.readSliceSingleThread(blobOffset, nextBlobOffset - blobOffset, true);
|
|
});
|
|
});
|
|
});
|
|
};
|
|
|
|
/**
|
|
* A Directory Listing object
|
|
* @typedef {Object} DirListing A list of pointers to directory entries (via the URL pointerlist)
|
|
* @property {String} path The path (url) to the directory entry for the Listing
|
|
* @property {String} ptrName The name of the pointer to the Listing's data that will be added to the ZIMFile obect
|
|
* @property {String} countName The name of the key that will contain the number of entries in the Listing, to be added to the ZIMFile object
|
|
*/
|
|
|
|
/**
|
|
* Read the metadata (archive offset pointer, and number of entiries) of one or more ZIM directory Listings.
|
|
* This supports reading a subset of user content that might be ordered differently from the main URL pointerlist.
|
|
* In particular, it supports the v1 article pointerlist, which contains articles sorted by title, superseding the article
|
|
* namespace ('A') in legazy ZIM archives.
|
|
* @param {Array<DirListing>} listings An array of DirListing objects (see zimArchive.js for examples)
|
|
* @returns {Promise} A promise that populates calculated entries in the ZIM file header
|
|
*/
|
|
ZIMFile.prototype.setListings = function(listings) {
|
|
var that = this;
|
|
// If we are in a legacy ZIM archive, we need to calculate the true article count (of entries in the A namespace)
|
|
// This effectively emulates the v1 article pointerlist
|
|
if (this.minorVersion === 0) {
|
|
console.debug('ZIM DirListing version: 0 (legacy)', this);
|
|
// Initiate a binary search for the first or last article
|
|
var getArticleIndexByOrdinal = function (ordinal) {
|
|
return util.binarySearch(0, that.entryCount, function(i) {
|
|
return that.dirEntryByTitleIndex(i).then(function(dirEntry) {
|
|
var ns = dirEntry.namespace;
|
|
var url = ns + '/' + dirEntry.getTitleOrUrl();
|
|
var prefix = ordinal === 'first' ? 'A' : 'B';
|
|
if (prefix < ns) return -1;
|
|
else if (prefix > ns) return 1;
|
|
return prefix < url ? -1 : 1;
|
|
});
|
|
}, true).then(function(index) {
|
|
return index;
|
|
});
|
|
};
|
|
return getArticleIndexByOrdinal('first').then(function(idxFirstArticle) {
|
|
return getArticleIndexByOrdinal('last').then(function(idxLastArticle) {
|
|
// Technically idxLastArticle points to the entry after the last article in the 'A' namespace,
|
|
// We subtract the first from the last to get the number of entries in the 'A' namespace
|
|
that.articlePtrPos = that.titlePtrPos + idxFirstArticle * 4;
|
|
that.articleCount = idxLastArticle - idxFirstArticle;
|
|
console.debug('Calculated article count is: ' + that.articleCount);
|
|
});
|
|
});
|
|
}
|
|
var highestListingVersion = 0;
|
|
var listingAccessor = function (listing) {
|
|
if (!listing) {
|
|
// No more listings, so exit
|
|
console.debug('ZIM DirListing version: ' + highestListingVersion, that);
|
|
console.debug('Article count is: ' + that.articleCount);
|
|
return null;
|
|
}
|
|
// Check if we already have this listing's values, so we don't do redundant binary searches
|
|
if (that[listing.ptrName] && that[listing.countName]) {
|
|
highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion);
|
|
// Get the next listing
|
|
return listingAccessor(listings.pop());
|
|
}
|
|
// Initiate a binary search for the listing URL
|
|
return util.binarySearch(0, that.entryCount, function(i) {
|
|
return that.dirEntryByUrlIndex(i).then(function(dirEntry) {
|
|
var url = dirEntry.namespace + "/" + dirEntry.url;
|
|
if (listing.path < url)
|
|
return -1;
|
|
else if (listing.path > url)
|
|
return 1;
|
|
else
|
|
return 0;
|
|
});
|
|
}).then(function(index) {
|
|
if (index === null) return null;
|
|
return that.dirEntryByUrlIndex(index);
|
|
}).then(function(dirEntry) {
|
|
if (!dirEntry) return null;
|
|
// Request the metadata for the blob represented by the dirEntry
|
|
return that.blob(dirEntry.cluster, dirEntry.blob, true);
|
|
}).then(function(metadata) {
|
|
// Note that we do not accept a listing if its size is 0, i.e. if it contains no data
|
|
// (although this should not occur, we have been asked to handle it - see kiwix-js #708)
|
|
if (metadata && metadata.size) {
|
|
that[listing.ptrName] = metadata.ptr;
|
|
that[listing.countName] = metadata.size / 4; // Each entry uses 4 bytes
|
|
highestListingVersion = Math.max(~~listing.path.replace(/.+(\d)$/, '$1'), highestListingVersion);
|
|
}
|
|
// Get the next Listing
|
|
return listingAccessor(listings.pop());
|
|
}).catch(function(err) {
|
|
console.error('There was an error accessing a Directory Listing', err);
|
|
});
|
|
};
|
|
listingAccessor(listings.pop());
|
|
};
|
|
|
|
/**
|
|
* Reads the whole MIME type list and returns it as a populated Map
|
|
* The mimeTypeMap is extracted once after the user has picked the ZIM file
|
|
* and is stored as ZIMFile.mimeTypes
|
|
* @param {File} file The ZIM file (or first file in array of files) from which the MIME type list
|
|
* is to be extracted
|
|
* @param {Integer} mimeListPos The offset in <file> at which the MIME type list is found
|
|
* @param {Integer} urlPtrPos The offset of URL Pointer List in the archive
|
|
* @returns {Promise} A promise for the MIME Type list as a Map
|
|
*/
|
|
function readMimetypeMap(file, mimeListPos, urlPtrPos) {
|
|
var typeMap = new Map;
|
|
var size = urlPtrPos - mimeListPos;
|
|
// ZIM archives produced since May 2020 relocate the URL Pointer List to the end of the archive
|
|
// so we limit the slice size to max 1024 bytes in order to prevent reading the entire archive into an array buffer
|
|
// See https://github.com/openzim/libzim/issues/353
|
|
size = size > 1024 ? 1024 : size;
|
|
return util.readFileSlice(file, mimeListPos, mimeListPos + size).then(function (data) {
|
|
if (data.subarray) {
|
|
var i = 0;
|
|
var pos = -1;
|
|
var mimeString;
|
|
while (pos < size) {
|
|
pos++;
|
|
mimeString = utf8.parse(data.subarray(pos), true);
|
|
// If the parsed data is an empty string, we have reached the end of the MIME type list, so break
|
|
if (!mimeString) break;
|
|
// Store the parsed string in the Map
|
|
typeMap.set(i, mimeString);
|
|
i++;
|
|
while (data[pos]) {
|
|
pos++;
|
|
}
|
|
}
|
|
}
|
|
return typeMap;
|
|
}).catch(function (err) {
|
|
console.error('Unable to read MIME type list', err);
|
|
return new Map;
|
|
});
|
|
}
|
|
|
|
return {
|
|
/**
|
|
* @param {Array<File>} fileArray An array of picked archive files
|
|
* @returns {Promise<Object>} A Promise for the ZimFile Object
|
|
*/
|
|
fromFileArray: function (fileArray) {
|
|
// Array of blob objects should be sorted by their name property
|
|
fileArray.sort(function (a, b) {
|
|
var nameA = a.name.toUpperCase();
|
|
var nameB = b.name.toUpperCase();
|
|
if (nameA < nameB) return -1;
|
|
if (nameA > nameB) return 1;
|
|
return 0;
|
|
});
|
|
return util.readFileSlice(fileArray[0], 0, 80).then(function (header) {
|
|
var mimeListPos = readInt(header, 56, 8);
|
|
var urlPtrPos = readInt(header, 32, 8);
|
|
return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (mapData) {
|
|
var zf = new ZIMFile(fileArray);
|
|
// Add an abstract archive name (ignoring split file extensions)
|
|
zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1');
|
|
// Provide a temporary, per-session numeric ZIM ID used in filecache.js
|
|
zf.id = fileIDs.get(zf.name);
|
|
if (zf.id === undefined) {
|
|
zf.id = tempFileId++;
|
|
fileIDs.set(zf.name, zf.id);
|
|
}
|
|
// For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
|
|
zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
|
|
zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
|
|
zf.entryCount = readInt(header, 24, 4);
|
|
zf.articleCount = null; // Calculated async by setListings() called from zimArchive.js
|
|
zf.clusterCount = readInt(header, 28, 4);
|
|
zf.urlPtrPos = urlPtrPos;
|
|
zf.titlePtrPos = readInt(header, 40, 8);
|
|
zf.articlePtrPos = null; // Calculated async by setListings()
|
|
zf.clusterPtrPos = readInt(header, 48, 8);
|
|
zf.mimeListPos = mimeListPos;
|
|
zf.mainPage = readInt(header, 64, 4);
|
|
zf.layoutPage = readInt(header, 68, 4);
|
|
zf.mimeTypes = mapData;
|
|
return zf;
|
|
});
|
|
});
|
|
}
|
|
};
|
|
});
|