Port proposed caching changes from Kiwix JS

Former-commit-id: 727c78557a98457d622955b2c1b30a909f6e5a89 [formerly 3820e67fdd6c756ba92b0b4fdee9a0c6a5bc805b [formerly 427a6213b23abc46e869ed8d5f061691cd7cedad]]
Former-commit-id: 0c9e50cac31aa043c2bb3d19c1907f3cbc819de7
Former-commit-id: 94d9599a6683a00eb486a46bf98edf797e985301
This commit is contained in:
Jaifroid 2020-10-31 18:44:24 +00:00
parent d797531593
commit 58944df5db
2 changed files with 96 additions and 55 deletions

View File

@ -1,5 +1,7 @@
/** /**
* filecache.js: Generic least-recently-used-cache used for reading file chunks. * filecache.js: Generic cache for small, frequently read file slices.
* It discards cached blocks according to a least-recently-used algorithm.
* It is used primarily for fast Directory Entry lookup, speeding up binary search.
* *
* Copyright 2020 Mossroy, peter-x, jaifroid and contributors * Copyright 2020 Mossroy, peter-x, jaifroid and contributors
* License GPL v3: * License GPL v3:
@ -35,28 +37,39 @@ define(['q'], function(Q) {
const BLOCK_SIZE = 4096; const BLOCK_SIZE = 4096;
/** /**
* Creates a new cache with max size limit * A Cache Entry
* @param {Integer} limit The maximum number of blocks of BLOCK_SIZE to be cached * @typedef CacheEntry
* @property {String} id The cache key (stored also in the entry)
* @property {CacheEntry} prev The previous linked cache entry
* @property {CacheEntry} next The next linked cache entry
* @property {Uint8Array} value The cached data
*/ */
function LRUCache(limit) {
console.log("Creating cache of size " + limit); /**
this._limit = limit; * A Block Cache employing a Least Recently Used caching strategy
this._size = 0; * @typedef BlockCache
// Mapping from id to {value: , prev: , next: } * @property {Integer} _limit The maximum number of entries in the cache
this._entries = {}; * @property {Map} _entries A map to store the cache keys and data
// linked list of entries * @property {CacheEntry} _first The most recent entry in the cache
this._first = null; * @property {CacheEntry} _last The least recently used entry in the cache
this._last = null; */
/**
* Creates a new cache with max size limit of MAX_CACHE_SIZE blocks
*/
function LRUCache() {
console.log('Creating cache of size ' + MAX_CACHE_SIZE + ' * ' + BLOCK_SIZE + ' bytes');
this._limit = MAX_CACHE_SIZE;
} }
/** /**
* Tries to retrieve an element by its id. If it is not present in the cache, returns undefined; if it is present, * Tries to retrieve an element by its id. If it is not present in the cache, returns undefined; if it is present,
* then the value is returned and the entry is moved to the top of the cache * then the value is returned and the entry is moved to the top of the cache
* @param {String} id The block cache entry key * @param {String} key The block cache entry key (byte offset + '' + file.id)
* @returns {Uint8Array|undefined} The requested cache data or undefined * @returns {Uint8Array|undefined} The requested cache data or undefined
*/ */
LRUCache.prototype.get = function(id) { LRUCache.prototype.get = function (key) {
var entry = this._entries[id]; var entry = this._entries.get(key);
if (entry === undefined) { if (entry === undefined) {
return entry; return entry;
} }
@ -66,30 +79,31 @@ define(['q'], function(Q) {
/** /**
* Stores a value in the cache by id and prunes the least recently used entry if the cache is larger than MAX_CACHE_SIZE * Stores a value in the cache by key and prunes the least recently used entry if the cache is larger than MAX_CACHE_SIZE
* @param {String} id The key under which to store the value (consists of filename + file number) * @param {String} key The key under which to store the value (byte offset from start of ZIM archive + '' + file.id)
* @param {Uint16Array} value The value to store in the cache * @param {Uint8Array} value The value to store in the cache
*/ */
LRUCache.prototype.store = function(id, value) { LRUCache.prototype.store = function (key, value) {
var entry = this._entries[id]; var entry = this.get(key);
if (entry === undefined) { if (entry === undefined) {
entry = this._entries[id] = {id: id, prev: null, next: null, value: value}; entry = {
id: key,
prev: null,
next: null,
value: value
};
this._entries.set(key, entry);
this.insertAtTop(entry); this.insertAtTop(entry);
if (this._size >= this._limit) { if (this._entries.size >= this._limit) {
var e = this._last; var e = this._last;
this.unlink(e); this.unlink(e);
delete this._entries[e.id]; this._entries.delete(e.id);
} else {
this._size++;
} }
} else {
entry.value = value;
this.moveToTop(entry);
} }
}; };
/** /**
* Delete a cache entry * Delete a cache entry
* @param {String} entry The entry to delete * @param {CacheEntry} entry The entry to delete
*/ */
LRUCache.prototype.unlink = function(entry) { LRUCache.prototype.unlink = function(entry) {
if (entry.next === null) { if (entry.next === null) {
@ -106,7 +120,7 @@ define(['q'], function(Q) {
/** /**
* Insert a cache entry at the top of the cache * Insert a cache entry at the top of the cache
* @param {String} entry The entry to insert * @param {CacheEntry} entry The entry to insert
*/ */
LRUCache.prototype.insertAtTop = function(entry) { LRUCache.prototype.insertAtTop = function(entry) {
if (this._first === null) { if (this._first === null) {
@ -120,27 +134,42 @@ define(['q'], function(Q) {
/** /**
* Move a cache entry to the top of the cache * Move a cache entry to the top of the cache
* @param {String} entry The entry to move * @param {CacheEntry} entry The entry to move
*/ */
LRUCache.prototype.moveToTop = function(entry) { LRUCache.prototype.moveToTop = function(entry) {
this.unlink(entry); this.unlink(entry);
this.insertAtTop(entry); this.insertAtTop(entry);
}; };
// Create a new cache /**
var cache = new LRUCache(MAX_CACHE_SIZE);
* A new Block Cache
* @type {BlockCache}
*/
var cache = new LRUCache();
// Counters for reporting only // Counters for reporting only
var hits = 0; var hits = 0;
var misses = 0; var misses = 0;
/**
* Initializes or resets the cache - this should be called whenever a new ZIM is loaded
*/
var init = function () {
console.log('Initialize or reset FileCache');
cache._entries = new Map();
// Initialize linked list of entries
cache._first = null;
cache._last = null;
};
/** /**
* Read a certain byte range in the given file, breaking the range into chunks that go through the cache * Read a certain byte range in the given file, breaking the range into chunks that go through the cache
* If a read of more than blocksize (bytes) is requested, do not use the cache * If a read of more than BLOCK_SIZE * 2 (bytes) is requested, do not use the cache
* @param {Object} file The requested ZIM archive to read from * @param {Object} file The requested ZIM archive to read from
* @param {Integer} begin The byte from which to start reading * @param {Integer} begin The byte from which to start reading
* @param {Integer} end The byte at which to stop reading (end will not be read) * @param {Integer} end The byte at which to stop reading (end will not be read)
* @return {Promise<Uint8Array>} A Promise that resolves to the correctly concatenated data from the split ZIM file set * @return {Promise<Uint8Array>} A Promise that resolves to the correctly concatenated data from the cache
* or from the ZIM archive
*/ */
var read = function(file, begin, end) { var read = function(file, begin, end) {
// Read large chunks bypassing the block cache because we would have to // Read large chunks bypassing the block cache because we would have to
@ -148,19 +177,21 @@ define(['q'], function(Q) {
if (end - begin > BLOCK_SIZE * 2) return file._readSplitSlice(begin, end); if (end - begin > BLOCK_SIZE * 2) return file._readSplitSlice(begin, end);
var readRequests = []; var readRequests = [];
var blocks = {}; var blocks = {};
for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) { // Look for the requested data in the blocks: we may need to stitch together data from two or more blocks
var block = cache.get(file.name + i); for (var id = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; id < end; id += BLOCK_SIZE) {
var block = cache.get(id + '' + file.id);
if (block === undefined) { if (block === undefined) {
// Data not in cache, so read from archive
misses++; misses++;
readRequests.push(function(offset) { readRequests.push(function(offset) {
return file._readSplitSlice(offset, offset + BLOCK_SIZE).then(function(result) { return file._readSplitSlice(offset, offset + BLOCK_SIZE).then(function(result) {
cache.store(file.name + offset, result); cache.store(offset + '' + file.id, result);
blocks[offset] = result; blocks[offset] = result;
}); });
}(i)); }(id));
} else { } else {
hits++; hits++;
blocks[i] = block; blocks[id] = block;
} }
} }
if (misses + hits > 2000) { if (misses + hits > 2000) {
@ -168,9 +199,11 @@ define(['q'], function(Q) {
hits = 0; hits = 0;
misses = 0; misses = 0;
} }
// Wait for all the blocks to be read either from the cache or from the archive
return Q.all(readRequests).then(function() { return Q.all(readRequests).then(function() {
var result = new Uint8Array(end - begin); var result = new Uint8Array(end - begin);
var pos = 0; var pos = 0;
// Stitch together the data parts in the right order
for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) { for (var i = Math.floor(begin / BLOCK_SIZE) * BLOCK_SIZE; i < end; i += BLOCK_SIZE) {
var b = Math.max(i, begin) - i; var b = Math.max(i, begin) - i;
var e = Math.min(end, i + BLOCK_SIZE) - i; var e = Math.min(end, i + BLOCK_SIZE) - i;
@ -182,6 +215,7 @@ define(['q'], function(Q) {
}; };
return { return {
read: read read: read,
init: init
}; };
}); });

View File

@ -22,6 +22,13 @@
'use strict'; 'use strict';
define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', 'filecache'], function(xz, zstd, util, utf8, Q, zimDirEntry, FileCache) { define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry', 'filecache'], function(xz, zstd, util, utf8, Q, zimDirEntry, FileCache) {
/**
* A variable to keep track of the currently loaded ZIM archive, e.g., for labelling cache entries
* The ID is temporary and is reset to 0 at each session start; it is incremented by 1 each time a new ZIM is loaded
* @type {Integer}
*/
var tempFileId = 0;
var readInt = function (data, offset, size) { var readInt = function (data, offset, size) {
var r = 0; var r = 0;
for (var i = 0; i < size; i++) { for (var i = 0; i < size; i++) {
@ -38,15 +45,15 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
* *
* @typedef ZIMFile * @typedef ZIMFile
* @property {Array<File>} _files Array of ZIM files * @property {Array<File>} _files Array of ZIM files
* @property {String} name Abstract name of ZIM file set * @property {Integer} id Arbitrary numeric ZIM id used to track the currently loaded archive
* @property {Integer} articleCount total number of articles * @property {Integer} articleCount Total number of articles
* @property {Integer} clusterCount total number of clusters * @property {Integer} clusterCount Total number of clusters
* @property {Integer} urlPtrPos position of the directory pointerlist ordered by URL * @property {Integer} urlPtrPos Position of the directory pointerlist ordered by URL
* @property {Integer} titlePtrPos position of the directory pointerlist ordered by title * @property {Integer} titlePtrPos Position of the directory pointerlist ordered by title
* @property {Integer} clusterPtrPos position of the cluster pointer list * @property {Integer} clusterPtrPos Position of the cluster pointer list
* @property {Integer} mimeListPos position of the MIME type list (also header size) * @property {Integer} mimeListPos Position of the MIME type list (also header size)
* @property {Integer} mainPage main page or 0xffffffff if no main page * @property {Integer} mainPage Main page or 0xffffffff if no main page
* @property {Integer} layoutPage layout page or 0xffffffffff if no layout page * @property {Integer} layoutPage Layout page or 0xffffffff if no layout page
*/ */
/** /**
@ -70,7 +77,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
}; };
/** /**
* Read a slice from the ZIM set starting at offset for size of bytes * Read a slice from the FileCache or ZIM set, starting at offset for size of bytes
* @param {Integer} offset The absolute offset from the start of the ZIM file or file set at which to start reading * @param {Integer} offset The absolute offset from the start of the ZIM file or file set at which to start reading
* @param {Integer} size The number of bytes to read * @param {Integer} size The number of bytes to read
* @returns {Promise<Uint8Array>} A Promise for a Uint8Array containing the requested data * @returns {Promise<Uint8Array>} A Promise for a Uint8Array containing the requested data
@ -105,7 +112,6 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
return readRequests[0]; return readRequests[0];
} else { } else {
// Wait until all are resolved and concatenate. // Wait until all are resolved and concatenate.
console.log("CONCAT");
return Q.all(readRequests).then(function(arrays) { return Q.all(readRequests).then(function(arrays) {
var concatenated = new Uint8Array(end - begin); var concatenated = new Uint8Array(end - begin);
var offset = 0; var offset = 0;
@ -119,7 +125,7 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
}; };
/** /**
* Read and parse a a Directory Entry at the given archive offset * Read and parse a Directory Entry at the given archive offset
* @param {Integer} offset The offset at which the DirEntry is located * @param {Integer} offset The offset at which the DirEntry is located
* @returns {Promise<DirEntry>} A Promise for the requested DirEntry * @returns {Promise<DirEntry>} A Promise for the requested DirEntry
*/ */
@ -279,9 +285,8 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
var urlPtrPos = readInt(header, 32, 8); var urlPtrPos = readInt(header, 32, 8);
return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) { return readMimetypeMap(fileArray[0], mimeListPos, urlPtrPos).then(function (data) {
var zf = new ZIMFile(fileArray); var zf = new ZIMFile(fileArray);
// Line below provides an abstracted filename in case the ZIM file is split into multiple parts; // Line below provides a temporary, per-session numeric ZIM ID used in filecache.js
// it greatly simplifies coding of the block cache, as it can store and respond to offsets from the start of the file set zf.id = tempFileId++;
zf.name = fileArray[0].name.replace(/(\.zim)\w\w$/i, '$1');
zf.articleCount = readInt(header, 24, 4); zf.articleCount = readInt(header, 24, 4);
zf.clusterCount = readInt(header, 28, 4); zf.clusterCount = readInt(header, 28, 4);
zf.urlPtrPos = urlPtrPos; zf.urlPtrPos = urlPtrPos;
@ -291,6 +296,8 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
zf.mainPage = readInt(header, 64, 4); zf.mainPage = readInt(header, 64, 4);
zf.layoutPage = readInt(header, 68, 4); zf.layoutPage = readInt(header, 68, 4);
zf.mimeTypes = data; zf.mimeTypes = data;
// Initialize or reset the FileCache
FileCache.init();
return zf; return zf;
}); });
}); });