diff --git a/www/js/lib/filecache.js b/www/js/lib/filecache.js index 6ef216c2..1bdbca6d 100644 --- a/www/js/lib/filecache.js +++ b/www/js/lib/filecache.js @@ -1,5 +1,7 @@ /** - * filecache.js: Generic least-recently-used-cache used for reading file chunks. + * filecache.js: Generic cache for small, frequently read file slices. + * It discards cached blocks according to a least-recently-used algorithm. + * It is used primarily for fast Directory Entry lookup, speeding up binary search. * * Copyright 2020 Mossroy, peter-x, jaifroid and contributors * License GPL v3: @@ -35,28 +37,39 @@ define(['q'], function(Q) { const BLOCK_SIZE = 4096; /** - * Creates a new cache with max size limit - * @param {Integer} limit The maximum number of blocks of BLOCK_SIZE to be cached + * A Cache Entry + * @typedef CacheEntry + * @property {String} id The cache key (stored also in the entry) + * @property {CacheEntry} prev The previous linked cache entry + * @property {CacheEntry} next The next linked cache entry + * @property {Uint8Array} value The cached data */ - function LRUCache(limit) { - console.log("Creating cache of size " + limit); - this._limit = limit; - this._size = 0; - // Mapping from id to {value: , prev: , next: } - this._entries = {}; - // linked list of entries - this._first = null; - this._last = null; + + /** + * A Block Cache employing a Least Recently Used caching strategy + * @typedef BlockCache + * @property {Integer} _limit The maximum number of entries in the cache + * @property {Map} _entries A map to store the cache keys and data + * @property {CacheEntry} _first The most recently used entry in the cache + * @property {CacheEntry} _last The least recently used entry in the cache + */ + + /** + * Creates a new cache with max size limit of MAX_CACHE_SIZE blocks + */ + function LRUCache() { + console.log('Creating cache of size ' + MAX_CACHE_SIZE + ' * ' + BLOCK_SIZE + ' bytes'); + this._limit = MAX_CACHE_SIZE; } /** * Tries to retrieve an element by its id. 
If it is not present in the cache, returns undefined; if it is present, * then the value is returned and the entry is moved to the top of the cache - * @param {String} id The block cache entry key + * @param {String} key The block cache entry key (byte offset + '' + file.id) * @returns {Uint8Array|undefined} The requested cache data or undefined */ - LRUCache.prototype.get = function(id) { - var entry = this._entries[id]; + LRUCache.prototype.get = function (key) { + var entry = this._entries.get(key); if (entry === undefined) { return entry; } @@ -66,30 +79,31 @@ define(['q'], function(Q) { /** * Stores a value in the cache by id and prunes the least recently used entry if the cache is larger than MAX_CACHE_SIZE - * @param {String} id The key under which to store the value (consists of filename + file number) - * @param {Uint16Array} value The value to store in the cache + * @param {String} key The key under which to store the value (byte offset from start of ZIM archive + '' + file.id) + * @param {Uint8Array} value The value to store in the cache */ - LRUCache.prototype.store = function(id, value) { - var entry = this._entries[id]; + LRUCache.prototype.store = function (key, value) { + var entry = this.get(key); if (entry === undefined) { - entry = this._entries[id] = {id: id, prev: null, next: null, value: value}; + entry = { + id: key, + prev: null, + next: null, + value: value + }; + this._entries.set(key, entry); this.insertAtTop(entry); - if (this._size >= this._limit) { + if (this._entries.size >= this._limit) { var e = this._last; this.unlink(e); - delete this._entries[e.id]; - } else { - this._size++; + this._entries.delete(e.id); } - } else { - entry.value = value; - this.moveToTop(entry); } }; /** * Delete a cache entry - * @param {String} entry The entry to delete + * @param {CacheEntry} entry The entry to delete */ LRUCache.prototype.unlink = function(entry) { if (entry.next === null) { @@ -106,7 +120,7 @@ define(['q'], function(Q) { /** * Insert a 
cache entry at the top of the cache - * @param {String} entry The entry to insert + * @param {CacheEntry} entry The entry to insert */ LRUCache.prototype.insertAtTop = function(entry) { if (this._first === null) { @@ -120,27 +134,42 @@ define(['q'], function(Q) { /** * Move a cache entry to the top of the cache - * @param {String} entry The entry to move + * @param {CacheEntry} entry The entry to move */ LRUCache.prototype.moveToTop = function(entry) { this.unlink(entry); this.insertAtTop(entry); }; - // Create a new cache - var cache = new LRUCache(MAX_CACHE_SIZE); + /** + + * A new Block Cache + * @type {BlockCache} + */ + var cache = new LRUCache(); // Counters for reporting only var hits = 0; var misses = 0; + /** + * Initializes or resets the cache - this should be called whenever a new ZIM is loaded + */ + var init = function () { + console.log('Initialize or reset FileCache'); + cache._entries = new Map(); + // Initialize linked list of entries + cache._first = null; + cache._last = null; + }; /** * Read a certain byte range in the given file, breaking the range into chunks that go through the cache - * If a read of more than blocksize (bytes) is requested, do not use the cache + * If a read of more than BLOCK_SIZE * 2 (bytes) is requested, do not use the cache * @param {Object} file The requested ZIM archive to read from * @param {Integer} begin The byte from which to start reading * @param {Integer} end The byte at which to stop reading (end will not be read) - * @return {Promise} A Promise that resolves to the correctly concatenated data from the split ZIM file set + * @return {Promise} A Promise that resolves to the correctly concatenated data from the cache + * or from the ZIM archive */ var read = function(file, begin, end) { // Read large chunks bypassing the block cache because we would have to @@ -148,19 +177,21 @@ define(['q'], function(Q) { if (end - begin > BLOCK_SIZE * 2) return file._readSplitSlice(begin, end); var readRequests = []; var blocks 
= {}; - for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) { - var block = cache.get(file.name + i); + // Look for the requested data in the blocks: we may need to stitch together data from two or more blocks + for (var id = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; id < end; id += BLOCK_SIZE) { + var block = cache.get(id + '' + file.id); if (block === undefined) { + // Data not in cache, so read from archive misses++; readRequests.push(function(offset) { return file._readSplitSlice(offset, offset + BLOCK_SIZE).then(function(result) { - cache.store(file.name + offset, result); + cache.store(offset + '' + file.id, result); blocks[offset] = result; }); - }(i)); + }(id)); } else { hits++; - blocks[i] = block; + blocks[id] = block; } } if (misses + hits > 2000) { @@ -168,9 +199,11 @@ define(['q'], function(Q) { hits = 0; misses = 0; } + // Wait for all the blocks to be read either from the cache or from the archive return Q.all(readRequests).then(function() { var result = new Uint8Array(end - begin); var pos = 0; + // Stitch together the data parts in the right order for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) { var b = Math.max(i, begin) - i; var e = Math.min(end, i + BLOCK_SIZE) - i; @@ -182,6 +215,7 @@ define(['q'], function(Q) { }; return { - read: read + read: read, + init: init }; }); \ No newline at end of file diff --git a/www/js/lib/zimfile.js b/www/js/lib/zimfile.js index 162a8735..d3514d99 100644 --- a/www/js/lib/zimfile.js +++ b/www/js/lib/zimfile.js @@ -22,6 +22,13 @@ 'use strict'; define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', 'filecache'], function(xz, zstd, util, utf8, Q, zimDirEntry, FileCache) { + /** + * A variable to keep track of the currently loaded ZIM archive, e.g., for labelling cache entries + * The ID is temporary and is reset to 0 at each session start; it is incremented by 1 each time a new ZIM is loaded + * @type {Integer} + */ + 
var tempFileId = 0; + var readInt = function (data, offset, size) { var r = 0; for (var i = 0; i < size; i++) { @@ -38,15 +45,15 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', * * @typedef ZIMFile * @property {Array} _files Array of ZIM files - * @property {String} name Abstract name of ZIM file set - * @property {Integer} articleCount total number of articles - * @property {Integer} clusterCount total number of clusters - * @property {Integer} urlPtrPos position of the directory pointerlist ordered by URL - * @property {Integer} titlePtrPos position of the directory pointerlist ordered by title - * @property {Integer} clusterPtrPos position of the cluster pointer list - * @property {Integer} mimeListPos position of the MIME type list (also header size) - * @property {Integer} mainPage main page or 0xffffffff if no main page - * @property {Integer} layoutPage layout page or 0xffffffffff if no layout page + * @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive + * @property {Integer} articleCount Total number of articles + * @property {Integer} clusterCount Total number of clusters + * @property {Integer} urlPtrPos Position of the directory pointer list ordered by URL + * @property {Integer} titlePtrPos Position of the directory pointer list ordered by title + * @property {Integer} clusterPtrPos Position of the cluster pointer list + * @property {Integer} mimeListPos Position of the MIME type list (also header size) + * @property {Integer} mainPage Main page or 0xffffffff if no main page + * @property {Integer} layoutPage Layout page or 0xffffffff if no layout page */ /** @@ -70,7 +77,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', }; /** - * Read a slice from the ZIM set starting at offset for size of bytes + * Read a slice from the FileCache or ZIM set, starting at offset for size of bytes * @param {Integer} offset The absolute offset from the start of 
the ZIM file or file set at which to start reading * @param {Integer} size The number of bytes to read * @returns {Promise} A Promise for a Uint8Array containing the requested data @@ -105,7 +112,6 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', return readRequests[0]; } else { // Wait until all are resolved and concatenate. - console.log("CONCAT"); return Q.all(readRequests).then(function(arrays) { var concatenated = new Uint8Array(end - begin); var offset = 0; @@ -119,7 +125,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', }; /** - * Read and parse a a Directory Entry at the given archive offset + * Read and parse a Directory Entry at the given archive offset * @param {Integer} offset The offset at which the DirEntry is located * @returns {Promise} A Promise for the requested DirEntry */ @@ -279,9 +285,8 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', var urlPtrPos = readInt(header, 32, 8); return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) { var zf = new ZIMFile(fileArray); - // Line below provides an abstracted filename in case the ZIM file is split into multiple parts; - // it greatly simplifies coding of the block cache, as it can store and respond to offsets from the start of the file set - zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1'); + // Line below provides a temporary, per-session numeric ZIM ID used in filecache.js + zf.id = tempFileId++; zf.articleCount = readInt(header, 24, 4); zf.clusterCount = readInt(header, 28, 4); zf.urlPtrPos = urlPtrPos; @@ -291,6 +296,8 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', zf.mainPage = readInt(header, 64, 4); zf.layoutPage = readInt(header, 68, 4); zf.mimeTypes = data; + // Initialize or reset the FileCache + FileCache.init(); return zf; }); });