Port proposed caching changes from Kiwix JS

Former-commit-id: 727c78557a98457d622955b2c1b30a909f6e5a89 [formerly 3820e67fdd6c756ba92b0b4fdee9a0c6a5bc805b [formerly 427a6213b23abc46e869ed8d5f061691cd7cedad]] Former-commit-id: 0c9e50cac31aa043c2bb3d19c1907f3cbc819de7 Former-commit-id: 94d9599a6683a00eb486a46bf98edf797e985301
2025-09-09 12:19:46 -04:00 · 2020-10-31 18:44:24 +00:00 · 2020-10-31 18:44:24 +00:00 · 58944df5db
commit 58944df5db
parent d797531593
2 changed files with 96 additions and 55 deletions
--- a/www/js/lib/filecache.js
+++ b/www/js/lib/filecache.js
@ -1,5 +1,7 @@
 /**
- * filecache.js: Generic least-recently-used-cache used for reading file chunks.
+ * filecache.js: Generic cache for small, frequently read file slices.
+ * It discards cached blocks according to a least-recently-used algorithm.
+ * It is used primarily for fast Directory Entry lookup, speeding up binary search.
 *
 * Copyright 2020 Mossroy, peter-x, jaifroid and contributors
 * License GPL v3:
@ -35,28 +37,39 @@ define(['q'], function(Q) {
    const BLOCK_SIZE = 4096;

    /**
-     * Creates a new cache with max size limit
-     * @param {Integer} limit The maximum number of blocks of BLOCK_SIZE to be cached
+     * A Cache Entry
+     * @typedef CacheEntry
+     * @property {String} id The cache key (stored also in the entry)
+     * @property {CacheEntry} prev The previous linked cache entry
+     * @property {CacheEntry} next The next linked cache entry
+     * @property {Uint8Array} value The cached data
     */
-    function LRUCache(limit) {
-        console.log("Creating cache of size " + limit);
-        this._limit = limit;
-        this._size = 0;
-        // Mapping from id to {value: , prev: , next: }
-        this._entries = {};
-        // linked list of entries
-        this._first = null;
-        this._last = null;
+
+    /**
+     * A Block Cache employing a Least Recently Used caching strategy
+     * @typedef BlockCache
+     * @property {Integer} _limit The maximum number of entries in the cache 
+     * @property {Map} _entries A map to store the cache keys and data
+     * @property {CacheEntry} _first The most recent entry in the cache
+     * @property {CacheEntry} _last The least recently used entry in the cache
+     */
+
+    /**
+     * Creates a new, empty cache with max size limit of MAX_CACHE_SIZE blocks
+     */
+    function LRUCache() {
+        console.log('Creating cache of size ' + MAX_CACHE_SIZE + ' * ' + BLOCK_SIZE + ' bytes');
+        this._limit = MAX_CACHE_SIZE;
+        // Start with empty storage so that get() and store() work even if init() has not been called yet
+        this._entries = new Map();
+        // Empty linked list of entries
+        this._first = null;
+        this._last = null;
     }

    /**
     * Tries to retrieve an element by its id. If it is not present in the cache, returns undefined; if it is present,
     * then the value is returned and the entry is moved to the top of the cache
-     * @param {String} id The block cache entry key
+     * @param {String} key The block cache entry key (byte offset + '' + file.id)
     * @returns {Uint8Array|undefined} The requested cache data or undefined 
     */
-    LRUCache.prototype.get = function(id) {
-        var entry = this._entries[id]; 
+    LRUCache.prototype.get = function (key) {
+        var entry = this._entries.get(key);
        if (entry === undefined) {
            return entry;
        }
@ -66,30 +79,31 @@ define(['q'], function(Q) {
    
    /**
     * Stores a value in the cache by id and prunes the least recently used entry if the cache is larger than MAX_CACHE_SIZE
-     * @param {String} id The key under which to store the value (consists of filename + file number)
-     * @param {Uint16Array} value The value to store in the cache 
+     * @param {String} key The key under which to store the value (byte offset from start of ZIM archive + '' + file.id)
+     * @param {Uint8Array} value The value to store in the cache 
     */
-    LRUCache.prototype.store = function(id, value) {
-        var entry = this._entries[id];
+    LRUCache.prototype.store = function (key, value) {
+        // get() also refreshes the entry's position if the key is already cached
+        var entry = this.get(key);
         if (entry === undefined) {
-            entry = this._entries[id] = {id: id, prev: null, next: null, value: value};
+            entry = {
+                id: key,
+                prev: null,
+                next: null,
+                value: value
+            };
+            this._entries.set(key, entry);
             this.insertAtTop(entry);
-            if (this._size >= this._limit) {
+            // The new entry is already counted in size, so prune only once the limit is exceeded;
+            // this keeps up to _limit entries cached and never evicts the entry just stored
+            if (this._entries.size > this._limit) {
                 var e = this._last;
                 this.unlink(e);
-                delete this._entries[e.id];
-            } else {
-                this._size++;
+                this._entries.delete(e.id);
             }
-        } else {
-            entry.value = value;
-            this.moveToTop(entry);
         }
     };

    /**
     * Delete a cache entry
-     * @param {String} entry The entry to delete 
+     * @param {CacheEntry} entry The entry to delete 
     */
    LRUCache.prototype.unlink = function(entry) {
        if (entry.next === null) {
@ -106,7 +120,7 @@ define(['q'], function(Q) {

    /**
     * Insert a cache entry at the top of the cache
-     * @param {String} entry The entry to insert 
+     * @param {CacheEntry} entry The entry to insert 
     */
    LRUCache.prototype.insertAtTop = function(entry) {
        if (this._first === null) {
@ -120,27 +134,42 @@ define(['q'], function(Q) {

    /**
     * Move a cache entry to the top of the cache
-     * @param {String} entry The entry to move 
+     * @param {CacheEntry} entry The entry to move 
     */
    LRUCache.prototype.moveToTop = function(entry) {
        this.unlink(entry);
        this.insertAtTop(entry);
    };

-    // Create a new cache
-    var cache = new LRUCache(MAX_CACHE_SIZE);
+    /**
+
+     * A new Block Cache
+     * @type {BlockCache}
+     */
+    var cache = new LRUCache();
    
    // Counters for reporting only
    var hits = 0;
    var misses = 0;

+    /**
+     * Initializes or resets the cache - this should be called whenever a new ZIM is loaded
+     */
+    var init = function () {
+        console.log('Initialize or reset FileCache');
+        cache._entries = new Map();
+        // Initialize linked list of entries
+        cache._first = null;
+        cache._last = null;
+    };
    /**
     * Read a certain byte range in the given file, breaking the range into chunks that go through the cache
-     * If a read of more than blocksize (bytes) is requested, do not use the cache
+     * If a read of more than BLOCK_SIZE * 2 (bytes) is requested, do not use the cache
     * @param {Object} file The requested ZIM archive to read from
     * @param {Integer} begin The byte from which to start reading
     * @param {Integer} end The byte at which to stop reading (end will not be read)
-     * @return {Promise<Uint8Array>} A Promise that resolves to the correctly concatenated data from the split ZIM file set
+     * @return {Promise<Uint8Array>} A Promise that resolves to the correctly concatenated data from the cache 
+     *     or from the ZIM archive
     */
    var read = function(file, begin, end) {
        // Read large chunks bypassing the block cache because we would have to
@ -148,19 +177,21 @@ define(['q'], function(Q) {
        if (end - begin > BLOCK_SIZE * 2) return file._readSplitSlice(begin, end);
        var readRequests = [];
        var blocks = {};
-        for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) {
-            var block = cache.get(file.name + i);
+        // Look for the requested data in the blocks: we may need to stitch together data from two or more blocks
+        for (var id = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; id < end; id += BLOCK_SIZE) {
+            var block = cache.get(id + '' + file.id);
            if (block === undefined) {
+                // Data not in cache, so read from archive
                misses++;
                readRequests.push(function(offset) {
                    return file._readSplitSlice(offset, offset + BLOCK_SIZE).then(function(result) {
-                        cache.store(file.name + offset, result);
+                        cache.store(offset + '' + file.id, result);
                        blocks[offset] = result;
                    });
-                }(i));
+                }(id));
            } else {
                hits++;
-                blocks[i] = block;
+                blocks[id] = block;
            }
        }
        if (misses + hits > 2000) {
@ -168,9 +199,11 @@ define(['q'], function(Q) {
            hits = 0;
            misses = 0;
        }
+        // Wait for all the blocks to be read either from the cache or from the archive
        return Q.all(readRequests).then(function() {
            var result = new Uint8Array(end - begin);
            var pos = 0;
+            // Stitch together the data parts in the right order
            for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) {
                var b = Math.max(i, begin) - i;
                var e = Math.min(end, i + BLOCK_SIZE) - i;
@ -182,6 +215,7 @@ define(['q'], function(Q) {
    };

    return {
-        read: read
+        read: read,
+        init: init
    };
 });
--- a/www/js/lib/zimfile.js
+++ b/www/js/lib/zimfile.js
@ -22,6 +22,13 @@
 'use strict';
 define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', 'filecache'], function(xz, zstd, util, utf8, Q, zimDirEntry, FileCache) {

+    /**
+     * A variable to keep track of the currently loaded ZIM archive, e.g., for labelling cache entries
+     * The ID is temporary and is reset to 0 at each session start; it is incremented by 1 each time a new ZIM is loaded
+     * @type {Integer} 
+     */
+    var tempFileId = 0;
+
    var readInt = function (data, offset, size) {
        var r = 0;
        for (var i = 0; i < size; i++) {
@ -38,15 +45,15 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
     * 
     * @typedef ZIMFile
     * @property {Array<File>} _files Array of ZIM files
-     * @property {String} name Abstract name of ZIM file set
-     * @property {Integer} articleCount total number of articles
-     * @property {Integer} clusterCount total number of clusters
-     * @property {Integer} urlPtrPos position of the directory pointerlist ordered by URL
-     * @property {Integer} titlePtrPos position of the directory pointerlist ordered by title
-     * @property {Integer} clusterPtrPos position of the cluster pointer list
-     * @property {Integer} mimeListPos position of the MIME type list (also header size)
-     * @property {Integer} mainPage main page or 0xffffffff if no main page
-     * @property {Integer} layoutPage layout page or 0xffffffffff if no layout page
+     * @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive
+     * @property {Integer} articleCount Total number of articles
+     * @property {Integer} clusterCount Total number of clusters
+     * @property {Integer} urlPtrPos Position of the directory pointerlist ordered by URL
+     * @property {Integer} titlePtrPos Position of the directory pointerlist ordered by title
+     * @property {Integer} clusterPtrPos Position of the cluster pointer list
+     * @property {Integer} mimeListPos Position of the MIME type list (also header size)
+     * @property {Integer} mainPage Main page or 0xffffffff if no main page
+     * @property {Integer} layoutPage Layout page or 0xffffffff if no layout page
     */
    
    /**
@ -70,7 +77,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
    };

    /**
-     * Read a slice from the ZIM set starting at offset for size of bytes
+     * Read a slice from the FileCache or ZIM set, starting at offset for size of bytes
     * @param {Integer} offset The absolute offset from the start of the ZIM file or file set at which to start reading
     * @param {Integer} size The number of bytes to read
     * @returns {Promise<Uint8Array>} A Promise for a Uint8Array containing the requested data
@ -105,7 +112,6 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
            return readRequests[0];
        } else {
            // Wait until all are resolved and concatenate.
-            console.log("CONCAT");
            return Q.all(readRequests).then(function(arrays) {
                var concatenated = new Uint8Array(end - begin);
                var offset = 0;
@ -119,7 +125,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
    };

    /**
-     * Read and parse a a Directory Entry at the given archive offset
+     * Read and parse a Directory Entry at the given archive offset
     * @param {Integer} offset The offset at which the DirEntry is located
     * @returns {Promise<DirEntry>} A Promise for the requested DirEntry
     */
@ -279,9 +285,8 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                var urlPtrPos = readInt(header, 32, 8);
                return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) {
                    var zf = new ZIMFile(fileArray);
-                    // Line below provides an abstracted filename in case the ZIM file is split into multiple parts;
-                    // it greatly simplifies coding of the block cache, as it can store and respond to offsets from the start of the file set
-                    zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1');
+                    // Line below provides a temporary, per-session numeric ZIM ID used in filecache.js
+                    zf.id = tempFileId++;
                    zf.articleCount = readInt(header, 24, 4);
                    zf.clusterCount = readInt(header, 28, 4);
                    zf.urlPtrPos = urlPtrPos;
@ -291,6 +296,8 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                    zf.mainPage = readInt(header, 64, 4);
                    zf.layoutPage = readInt(header, 68, 4);
                    zf.mimeTypes = data;
+                    // Initialize or reset the FileCache
+                    FileCache.init();
                    return zf;
                });
            });