From c6489080ffbae9c87b18aeb2d48a13191a12c182 Mon Sep 17 00:00:00 2001 From: Jaifroid Date: Thu, 3 Sep 2020 08:05:28 +0100 Subject: [PATCH] Add updates from kiwix-js Former-commit-id: 25d2700cb2ecc3c22bcd70833ed514fc6a72724d [formerly ff1f960c784b76dc2d2ecd4d7bdbd58524d1f14f [formerly 52dfa73d8c84e3816e2c09c9c03304efff590076]] Former-commit-id: 4fb64b04d17666efb8f42fabf45e413c582dde45 Former-commit-id: 81c61791af26b259ec2510acb9e73dc48ae6d106 --- www/js/lib/zstdec_wrapper.js | 194 ++++++++++++++++++----------------- 1 file changed, 98 insertions(+), 96 deletions(-) diff --git a/www/js/lib/zstdec_wrapper.js b/www/js/lib/zstdec_wrapper.js index bff2b5fa..85fbea7d 100644 --- a/www/js/lib/zstdec_wrapper.js +++ b/www/js/lib/zstdec_wrapper.js @@ -25,6 +25,16 @@ define(['q', 'zstdec'], function(Q) { // Note that we include zstdec above in requireJS definition, but we cannot change the name in the function list // There is no longer any need to load it in index.html // For explanation of loading method below to avoid conflicts, see https://github.com/emscripten-core/emscripten/blob/master/src/settings.js + + /** + * The ZSTD Decoder instance + * @constructor Constructs the zd object representing a ZSTD decoder Emscripten instance + * @property {Integer} _decHandle The decoder stream context object in asm memory (to be re-used for each decoder operation) + * @property {Object} _inBuffer A JS copy of the inBuffer structure to be set in asm memory (malloc) + * @property {Object} _outBuffer A JS copy of the outBuffer structure to be set in asm memory (malloc) + * @property {Integer} _chunkSize The number of compressed bytes to feed to the decompressor in any one read loop + + */ var zd; ZD().then(function(instance) { // Instantiate the zd object @@ -35,6 +45,40 @@ define(['q', 'zstdec'], function(Q) { // Get a permanent decoder handle (pointer to control structure) // NB there is no need to change this handle even between ZIM loads: zstddeclib encourages re-using assigned structures zd._decHandle = zd._ZSTD_createDStream(); + // DEV set chunkSize according to memory environment; for systems with plenty of memory, + // zd can provide a max recommended size with zd._chunkSize = zd._ZSTD_DStreamInSize(); + zd._chunkSize = 5 * 1024; + + // Initialize inBuffer + zd._inBuffer = { + ptr: null, /* pointer to this inBuffer structure in w/asm memory */ + src: null, /* void* src < start of input buffer */ + size: zd._chunkSize, /* size_t size < size of input buffer */ + pos: 0 /* size_t pos; < position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ + }; + // Reserve w/asm memory for the inBuffer structure (we will populate assigned memory later) + zd._inBuffer.ptr = mallocOrDie(3 << 2); // 3 x 32bit bytes + // Reserve w/asm memory for the inBuffer data stream + zd._inBuffer.src = mallocOrDie(zd._inBuffer.size); + + // DEV: Size of outBuffer is currently set as recommended by zd._ZSTD_DStreamOutSize() below; if you are running into + // memory issues, it may be possible to reduce memory consumption by setting a smaller outBuffer size here and + // reompiling zstdec.js with lower TOTAL_MEMORY (or just search for INITIAL_MEMORY in zstdec.js and change it) + var recOutBufSize = zd._chunkSize * 4; + var maxOutBufSize = zd._ZSTD_DStreamOutSize(); + var outBufSize = recOutBufSize > maxOutBufSize ? maxOutBufSize : recOutBufSize; + + // Initialize outBuffer + zd._outBuffer = { + ptr: null, /* pointer to this outBuffer structure in asm/wasm memory */ + dst: null, /* void* dst < start of output buffer (pointer) */ + size: outBufSize, /* size_t size < size of output buffer */ + pos: 0 /* size_t pos < position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ + }; + // Reserve w/asm memory for the outBuffer structure + zd._outBuffer.ptr = mallocOrDie(3 << 2); // 3 x 32bit bytes + // Reserve w/asm memory for the outBuffer data steam + zd._outBuffer.dst = mallocOrDie(zd._outBuffer.size); }); /** @@ -51,33 +95,26 @@ define(['q', 'zstdec'], function(Q) { /** * @typedef Decompressor - * @property {Integer} _chunkSize The amount to feed to the decompressor in any one read loop * @property {FileReader} _reader The filereader to use (uses plain blob reader defined in zimfile.js) * @property {Integer} _inStreamPos The current known position in the steam of compressed bytes * @property {Integer} _inStreamChunkedPos The position once the currently loaded chunk will have been consumed * @property {Integer} _outStreamPos The position in the decoded byte stream (offset from start of cluster) - * @property {Array} _outDataBuf The buffer that stores decoded bytes (it is set to the requested blob's lenght, and when full, the data are returned) + * @property {Array} _outDataBuf The buffer that stores decoded bytes (it is set to the requested blob's length, and when full, the data are returned) * @property {Integer} _outDataBufPos The number of bytes of the requested blob decoded so far - * @property {Object} _inBuffer A JS copy of the inBuffer structure to be set in decompressor memory (malloc) - * @property {Object} _outBuffer A JS copy of the outBuffer structure to be set in decompressor memory (malloc) */ /** * @constructor - * @param {FileReader} reader - * @param {Integer} chunkSize - * @returns {Decompressor} + * @param {FileReader} reader The reader used to extract file slices (defined in zimfile.js) */ - function Decompressor(reader, chunkSize) { - this._chunkSize = chunkSize || 5 * 1024; - // this._chunkSize = chunkSize || zd._ZSTD_DStreamInSize(); + function Decompressor(reader) { this._reader = reader; } /** - * Read length bytes, offset into the decompressed stream. Consecutive calls may only - * advance in the stream and may not overlap. - * @param {Integer} offset Offset from which to start reading - * @param {Integer} length Number of bytes to read + * Set up the decompression stream, and initiate a read loop to decompress from the beginning of the cluster + * until we reach in the decompressed byte stream + * @param {Integer} offset Cluster offset (in deocmpressed stream) from which to start reading + * @param {Integer} length Number of decompressed bytes to read * @returns {Promise} Promise for an ArrayBuffer with decoded data */ Decompressor.prototype.readSlice = function(offset, length) { @@ -87,28 +124,6 @@ define(['q', 'zstdec'], function(Q) { this._outStreamPos = 0; this._outDataBuf = new Int8Array(new ArrayBuffer(length)); this._outDataBufPos = 0; - - // Initialize inBuffer - this._inBuffer = { - ptr: null, /* pointer to this inBuffer structure in w/asm memory */ - src: null, /* void* src < start of input buffer */ - size: length, /* size_t size < size of input buffer */ - pos: 0 /* size_t pos; < position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */ - }; - // Reserve w/asm memory for the outBuffer structure - this._inBuffer.ptr = mallocOrDie(3 << 2); // 3 x 32bit bytes - // DEV: Size of outBuffer is currently set as recommended by zd._ZSTD_DStreamOutSize() below; if you are running into - // memory issues, it may be possible to reduce memory consumption by setting asmaller outBuffer size here and - // reompiling zstdec.js with lower TOTAL_MEMORY (or just search for INITIAL_MEMORY in zstdec.js and change it) - var recOutbufSize = zd._ZSTD_DStreamOutSize(); - // Initialize outBuffer - this._outBuffer = { - ptr: null, /* pointer to this outBuffer structure in asm/wasm memory */ - dst: null, /* void* dst < start of output buffer (pointer) */ - size: recOutbufSize, /* size_t size < size of output buffer */ - pos: 0 /* size_t pos < position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ - }; - this._outBuffer.ptr = mallocOrDie(3 << 2); // 3 x 32bit bytes var ret = zd._ZSTD_initDStream(zd._decHandle); if (zd._ZSTD_isError(ret)) { return Q.reject('Failed to initialize ZSTD decompression'); @@ -116,27 +131,23 @@ define(['q', 'zstdec'], function(Q) { var that = this; return this._readLoop(offset, length).then(function(data) { - // DEV: These structures are a known fixed length and could be assigned once, avoiding the need to free them - // currently they are re-assigned on each blob request; consider changing this if memory usage appears to grow over time - zd._free(that._inBuffer.src); - zd._free(that._inBuffer.ptr); - zd._free(that._outBuffer.dst); - zd._free(that._outBuffer.ptr); - // DEV: Freeing zd._decHandle is not needed, and actually increases memory consumption (crashing zstddeclib) - // The library explicitly encourages re-using assigned structures and handles + // DEV: We are re-using all the allocated w/asm memory, so we do not need to free any of structures assigned wiht _malloc + // However, should you need to free assigned structures use, e.g., zd._free(zd._inBuffer.src); + // Additionally, freeing zd._decHandle is not needed, and actually increases memory consumption (crashing zstddeclib) + // Should you need to free the decoder stream handle, use command below, but be sure to create a new stream control object + // before attempting further decompression // zd._ZSTD_freeDStream(zd._decHandle); busy = false; - console.log("Freed all data structures."); return data; }); }; /** - * Reads stream of data from file offset for length of bytes to send to the decompresor - * This function ensures that only one decompression runs at a time - * @param {Integer} offset The file offset at which to begin reading compressed data - * @param {Integer} length The amount of data to read - * @returns {Promise} A Promise for the read data + * This function ensures that only one decompression runs at a time, launching readSlice() only when + * the decompressor is no longer busy + * @param {Integer} offset The cluster offset (in decompressed stream) at which the requested blob resides + * @param {Integer} length The number of decompressed bytes to read + * @returns {Promise} A Promise for the readSlice() function */ Decompressor.prototype.readSliceSingleThread = function (offset, length) { if (!busy) { @@ -156,15 +167,16 @@ define(['q', 'zstdec'], function(Q) { /** * The main loop for sending compressed data to the decompressor and retrieving decompressed bytes - * @param {Integer} offset The offset in the *decompressed* byte stream at which the requeste blob resides + * Consecutive calls to readLoop may only advance in the stream and may not overlap + * @param {Integer} offset The offset in the *decompressed* byte stream at which the requested blob resides * @param {Integer} length The deomcpressed size of the requested blob + * @param {Integer} dataRequest The recommended number of bytes the docompressor has requested * @returns {Promise} A Promise for an Int8Array containing the requested blob's decompressed bytes */ - Decompressor.prototype._readLoop = function(offset, length) { + Decompressor.prototype._readLoop = function(offset, length, dataRequest) { var that = this; - return this._fillInBufferIfNeeded(offset, length).then(function() { - var ret = zd._ZSTD_decompressStream(zd._decHandle, that._outBuffer.ptr, that._inBuffer.ptr); - // var ret = zd._ZSTD_decompressStream_simpleArgs(that._decHandle, that._outBuffer.ptr, that._outBuffer.size, 0, that._inBuffer.ptr, that._inBuffer.size, 0); + return this._fillInBufferIfNeeded(offset, length, dataRequest).then(function() { + var ret = zd._ZSTD_decompressStream(zd._decHandle, zd._outBuffer.ptr, zd._inBuffer.ptr); if (zd._ZSTD_isError(ret)) { var errorMessage = "Failed to decompress data stream!\n" + zd.getErrorString(ret); console.error(errorMessage); @@ -176,17 +188,16 @@ define(['q', 'zstdec'], function(Q) { finished = true; } else if (ret > 0) { // supply more data - that._inBuffer.size = ret; + zd._inBuffer.size = ret; } // Get updated inbuffer values for processing on the JS sice // NB the zd.Decoder will read these values from its own buffers - var ibx32ptr = that._inBuffer.ptr >> 2; - that._inBuffer.pos = zd.HEAP32[ibx32ptr + 2]; + var ibx32ptr = zd._inBuffer.ptr >> 2; + zd._inBuffer.pos = zd.HEAP32[ibx32ptr + 2]; // Get updated outbuffer values - var obx32ptr = that._outBuffer.ptr >> 2; - // that._outBuffer.size = zd.HEAP32[obx32ptr + 1]; + var obx32ptr = zd._outBuffer.ptr >> 2; var outPos = zd.HEAP32[obx32ptr + 2]; // If data have been decompressed, check to see whether the data are in the offset range we need @@ -195,69 +206,59 @@ define(['q', 'zstdec'], function(Q) { console.log('**Copying decompressed bytes**\ncopyStart: ' + copyStart); if (copyStart < 0) copyStart = 0; for (var i = copyStart; i < outPos && that._outDataBufPos < that._outDataBuf.length; i++) - that._outDataBuf[that._outDataBufPos++] = zd.HEAP8[that._outBuffer.dst + i]; + that._outDataBuf[that._outDataBufPos++] = zd.HEAP8[zd._outBuffer.dst + i]; } if (that._outDataBufPos === that._outDataBuf.length) finished = true; // Increment the byte stream positions - that._inStreamPos += that._inBuffer.pos; + that._inStreamPos += zd._inBuffer.pos; that._outStreamPos += outPos; + // DEV: if outPos is > 0, then we have either copied all data from outBuffer, or we can now throw those data away + // because they are before our required offset + // Se we can now reset the asm outBuffer.pos field to 0 + zd.HEAP32[obx32ptr + 2] = 0; + // However, this isn't necessary becasuse zd._outBuffer.pos is always 0, and the buffer will be reset - WILL IT??? + // do not change the _outBuffer.size field locally; _outBuffer.size is the maximum amount the ZSTD codec is allowed + // to decode in one go, but even if it is only partially written, we just copy the decoded bytes and reset _ouBuffer.pos to 0 // TESTING (remove before merge) console.log("Offset: " + offset + "\nLength: " + length + "\ninStreamPos: " + that._inStreamPos + "\noutStreamPos: " + that._outStreamPos); - if (outPos > 0) { - // We have either copied all data from outBuffer, or we can throw those data away because they are before our required offset - // This resets the outbuffer->ptr to 0, so we can re-use the outbuffer memory space without re-initializing - // Below is the 'raw' way to do this for info, but the JS copy will be set in fillInBufferIfNeeded() - // zd.HEAP32[obx32ptr + 2] = 0; - that._outBuffer.pos = 0; - } if (finished) { console.log("Read loop finished."); return that._outDataBuf; } else { - return that._readLoop(offset, length); + return that._readLoop(offset, length, ret); } }); }; /** * Fills in the instream buffer if needed - * @param {Integer} currOffset The current read offset - * @param {Integer} len The decompressed length of data requested + * @param {Integer} req The requested number of compressed bytes (optional) * @returns {Promise<0>} A Promise for 0 when all data have been added to the stream */ - Decompressor.prototype._fillInBufferIfNeeded = function(currOffset, len) { - if (this._inStreamPos + len < this._inStreamChunkedPos) { - // We should still have enough data in the buffer (because decompressed len > compressed len) + Decompressor.prototype._fillInBufferIfNeeded = function(req) { + req = req || 0; + if (this._inStreamPos + req < this._inStreamChunkedPos) { + // We should still have enough data in the buffer // DEV: When converting to Promise/A+, use Promise.resolve(0) here return Q.when(0); } var that = this; - return this._reader(this._inStreamPos, this._chunkSize).then(function(data) { + return this._reader(this._inStreamPos, zd._chunkSize).then(function(data) { // Populate inBuffer and assign asm/wasm memory if not already assigned - that._inBuffer.size = data.length; - if (!that._inBuffer.src) { - that._inBuffer.src = mallocOrDie(that._inBuffer.size); - } - // Re-use inBuffer - that._inBuffer.pos = 0; - var inBufferStruct = new Int32Array([that._inBuffer.src, that._inBuffer.size, that._inBuffer.pos]); + zd._inBuffer.size = data.length; + // Reset inBuffer + zd._inBuffer.pos = 0; + var inBufferStruct = new Int32Array([zd._inBuffer.src, zd._inBuffer.size, zd._inBuffer.pos]); // Write inBuffer structure to previously assigned w/asm memory - zd.HEAP32.set(inBufferStruct, that._inBuffer.ptr >> 2); - // Populate outBuffer (but re-use existing if it was already assinged) - // DEV: because we're re-using the allocated memory (malloc), you cannot change the _outBuffer.size field locally - // _outBuffer.size is the maximum amount the ZSTD codec is allowed to decode in one go - // so if we need more data, we just copy those decoded bytes and reset _ouBuffer.pos to 0 - if (!that._outBuffer.dst) { - that._outBuffer.dst = mallocOrDie(that._outBuffer.size); - } - var outBufferStruct = new Int32Array([that._outBuffer.dst, that._outBuffer.size, that._outBuffer.pos]); + zd.HEAP32.set(inBufferStruct, zd._inBuffer.ptr >> 2); + var outBufferStruct = new Int32Array([zd._outBuffer.dst, zd._outBuffer.size, zd._outBuffer.pos]); // Write outBuffer structure to w/asm memory - zd.HEAP32.set(outBufferStruct, that._outBuffer.ptr >> 2); + zd.HEAP32.set(outBufferStruct, zd._outBuffer.ptr >> 2); // Transfer the (new) data to be read to the inBuffer - zd.HEAP8.set(data, that._inBuffer.src); + zd.HEAPU8.set(data, zd._inBuffer.src); that._inStreamChunkedPos += data.length; return 0; }); @@ -265,8 +266,9 @@ define(['q', 'zstdec'], function(Q) { /** * Provision asm/wasm data block and get a pointer to the assigned location - * @param {Number} sizeOfData The number of bytes to be allocated - * @returns {Number} Pointer to the assigned data block + * Code used from excellent WASM tutorial here: https://marcoselvatici.github.io/WASM_tutorial/ + * @param {Integer} sizeOfData The number of bytes to be allocated + * @returns {Integer} Pointer to the assigned data block */ function mallocOrDie(sizeOfData) { const dataPointer = zd._malloc(sizeOfData);