From 9430603001b5a76b34ef2680b90b8d1f91c1ea71 Mon Sep 17 00:00:00 2001 From: Peter Date: Fri, 14 Feb 2014 02:14:40 +0100 Subject: [PATCH] Added title iterator as preparation for infix search. --- www/js/lib/archive.js | 214 +++++++++++------------------------ www/js/lib/title.js | 2 +- www/js/lib/titleIterators.js | 121 ++++++++++++++++++++ www/js/lib/util.js | 25 +++- 4 files changed, 212 insertions(+), 150 deletions(-) create mode 100644 www/js/lib/titleIterators.js diff --git a/www/js/lib/archive.js b/www/js/lib/archive.js index 6a007ef6..9cbe6795 100644 --- a/www/js/lib/archive.js +++ b/www/js/lib/archive.js @@ -29,6 +29,7 @@ define(function(require) { var util = require('util'); var geometry = require('geometry'); var jQuery = require('jquery'); + var titleIterators = require('titleIterators'); // Declare the webworker that can uncompress with bzip2 algorithm var webworkerBzip2 = new Worker("js/lib/webworker_bzip2.js"); @@ -303,114 +304,30 @@ define(function(require) { }); }; - /** - * This function is recursively called after each asynchronous read, so that - * to find the closest index in titleFile to the given prefix - * When found, call the callbackFunction with the index - * - * @param reader - * @param normalizedPrefix - * @param lo - * @param hi - * @param callbackFunction - */ - LocalArchive.prototype.recursivePrefixSearch = function(reader, normalizedPrefix, lo, hi, callbackFunction) { - if (lo < hi - 1) { - var mid = Math.floor((lo + hi) / 2); - var blob = this.titleFile.slice(mid, mid + MAX_TITLE_LENGTH); - var currentLocalArchiveInstance = this; - reader.onload = function(e) { - var binaryTitleFile = e.target.result; - var byteArray = new Uint8Array(binaryTitleFile); - // Look for the index of the next NewLine - var newLineIndex = 0; - while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) { - newLineIndex++; - } - var startIndex = 0; - if (mid > 0) { - startIndex = newLineIndex + 16; - newLineIndex = startIndex; - // Look for 
the index of the next NewLine - while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) { - newLineIndex++; - } - } - if (newLineIndex === startIndex) { - // End of file reached - hi = mid; - } - else { - var normalizedTitle = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive( - utf8.parse(byteArray.subarray(startIndex, newLineIndex))); - if (normalizedTitle < normalizedPrefix) { - lo = mid + newLineIndex - 1; - } - else { - hi = mid; - } - } - currentLocalArchiveInstance.recursivePrefixSearch(reader, normalizedPrefix, lo, hi, callbackFunction); - }; - // Read the file as a binary string - reader.readAsArrayBuffer(blob); - } - else { - if (lo > 0) { - // Let lo point to the start of an entry - lo++; - lo++; - } - // We found the closest title at index lo - callbackFunction(lo); - } - }; - /** * Read the titles in the title file starting at the given offset (maximum titleCount), and call the callbackFunction with this list of Title instances - * @param titleOffset + * @param titleOffset offset into the title file - it has to point exactly + * to the start of a title entry * @param titleCount maximum number of titles to retrieve * @param callbackFunction */ LocalArchive.prototype.getTitlesStartingAtOffset = function(titleOffset, titleCount, callbackFunction) { - var reader = new FileReader(); - reader.onerror = errorHandler; - reader.onabort = function(e) { - alert('Title file read cancelled'); - }; - - var currentLocalArchiveInstance = this; - reader.onload = function(e) { - var binaryTitleFile = e.target.result; - var byteArray = new Uint8Array(binaryTitleFile); - var i = 0; - var newLineIndex = 0; - var titleNumber = 0; - var titleList = new Array(); - while (i < byteArray.length && titleNumber < titleCount) { - // Look for the index of the next NewLine - newLineIndex += 15; - while (newLineIndex < byteArray.length && byteArray[newLineIndex] != 10) { - newLineIndex++; + var titles = []; + jQuery.when().then(function() { + var
iterator = new titleIterators.SequentialTitleIterator(this, titleOffset); + function addNext() { + if (titles.length >= titleCount) { + return titles; } - - // Copy the encodedTitle in a new Array - var encodedTitle = new Uint8Array(newLineIndex - i); - for (var j = 0; j < newLineIndex - i; j++) { - encodedTitle[j] = byteArray[i + j]; - } - - var title = evopediaTitle.Title.parseTitle(encodedTitle, currentLocalArchiveInstance, i); - - titleList[titleNumber] = title; - titleNumber++; - i = newLineIndex + 1; + return iterator.advance().then(function(title) { + if (title == null) + return titles; + titles.push(title); + return addNext(); + }); } - callbackFunction(titleList); - }; - var blob = this.titleFile.slice(titleOffset, titleOffset + titleCount * MAX_TITLE_LENGTH); - // Read in the file as a binary string - reader.readAsArrayBuffer(blob); + return addNext(); + }.bind(this)).then(callbackFunction, errorHandler); }; /** @@ -420,30 +337,23 @@ define(function(require) { * @param callbackFunction */ LocalArchive.prototype.getTitleByName = function(titleName, callbackFunction) { - var titleFileSize = this.titleFile.size; - var reader = new FileReader(); - reader.onerror = errorHandler; - reader.onabort = function(e) { - alert('Title file read cancelled'); - }; - var currentLocalArchiveInstance = this; - var normalizedTitleName = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(titleName); - this.recursivePrefixSearch(reader, normalizedTitleName, 0, titleFileSize, function(titleOffset) { - currentLocalArchiveInstance.getTitlesStartingAtOffset(titleOffset, MAX_TITLES_WITH_SAME_NORMALIZED_NAME, function(titleList) { - if (titleList !== null && titleList.length>0) { - for (var i=0; i= maxSize) { + return titles; } - callbackFunction(titleList.slice(0, i)); - }); - }); + return iterator.advance().then(function(title) { + if (title == null) + return titles; + // check whether this title really starts with the prefix + var name = normalize(title.name); + if (name.length
< prefix.length || name.substring(0, prefix.length) != prefix) + return titles; + titles.push(title); + return addNext(); + }); + } + return addNext(); + }).then(callbackFunction, errorHandler); }; @@ -950,6 +858,18 @@ define(function(require) { } }; + /** + * Returns a function that normalizes strings if the current archive is compatible. + * If it is not, returns the identity function. + */ + LocalArchive.prototype.getNormalizeFunction = function() { + if (this.normalizedTitles === true) { + return normalize_string.normalizeString; + } else { + return function(string) { return string; } + } + }; + /** * ErrorHandler for FileReader * @param {type} evt diff --git a/www/js/lib/title.js b/www/js/lib/title.js index 8678fdb7..3299f53e 100644 --- a/www/js/lib/title.js +++ b/www/js/lib/title.js @@ -147,4 +147,4 @@ define(function(require) { return { Title: Title }; -}); \ No newline at end of file +}); diff --git a/www/js/lib/titleIterators.js b/www/js/lib/titleIterators.js new file mode 100644 index 00000000..d70432da --- /dev/null +++ b/www/js/lib/titleIterators.js @@ -0,0 +1,121 @@ +/** + * titleIterators.js : Various classes to iterate over titles, for example as a + * result of searching. + * + * Copyright 2014 Evopedia developers + * License GPL v3: + * + * This file is part of Evopedia. + * + * Evopedia is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Evopedia is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Evopedia (file LICENSE-GPLv3.txt). 
If not, see + */ +define(['utf8', 'title', 'util', 'jquery'], function(utf8, evopediaTitle, util, jQuery) { + // Maximum length of a title + // 300 bytes is arbitrary : we actually do not really know how long the titles will be + // But mediawiki titles seem to be limited to ~200 bytes, so 300 should be more than enough + var MAX_TITLE_LENGTH = 300; + + /** + * Iterates over all titles starting at the given offset. + * The asynchronous method advance has to be called before this.title is + * valid. + */ + function SequentialTitleIterator(archive, offset) { + this._titleFile = archive.titleFile; + this._archive = archive; + this._offset = offset; + this.title = null; + } + /** + * Advances to the next title (or the first), if possible. + * @returns jQuery promise containing the next title or null if there is no + * next title; the same value is also stored in this.title + */ + SequentialTitleIterator.prototype.advance = function() { + if (this._offset >= this._titleFile.size) { + this.title = null; + return jQuery.when(this.title); + } + var that = this; + return util.readFileSlice(this._titleFile, this._offset, + this._offset + MAX_TITLE_LENGTH).then(function(byteArray) { + var newLineIndex = 15; + while (newLineIndex < byteArray.length && byteArray[newLineIndex] != 10) { + newLineIndex++; + } + var encodedTitle = byteArray.subarray(0, newLineIndex); + that.title = evopediaTitle.Title.parseTitle(encodedTitle, that._archive, that._offset); + that._offset += newLineIndex + 1; + return that.title; + }); + } + + /** + * Searches for the offset into the given title file where the first title + * with the given prefix (or lexicographically larger) is located. + * The given function normalize is applied to every title before comparison. 
+ * @returns jQuery promise giving the offset + */ + function FindPrefixOffset(titleFile, prefix, normalize) { + prefix = normalize(prefix); + var lo = 0; + var hi = titleFile.size; + var iterate = function() { + if (lo >= hi) { + if (lo > 0) + lo += 2; // Let lo point to the start of an entry + return jQuery.when(lo); + } else { + var mid = Math.floor((lo + hi) / 2); + return util.readFileSlice(titleFile, mid, mid + MAX_TITLE_LENGTH).then(function(byteArray) { + // Look for the index of the next NewLine + var newLineIndex = 0; + while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) { + newLineIndex++; + } + var startIndex = 0; + if (mid > 0) { + startIndex = newLineIndex + 16; + newLineIndex = startIndex; + // Look for the index of the next NewLine + while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) { + newLineIndex++; + } + } + if (newLineIndex === startIndex) { + // End of file reached + hi = mid; + } else { + var normalizedTitle = normalize(utf8.parse(byteArray.subarray(startIndex, newLineIndex))); + if (normalizedTitle < prefix) { + lo = mid + newLineIndex - 1; + } else { + hi = mid; + } + } + return iterate(); + }); + } + } + return iterate(); + } + + /** + * Functions and classes exposed by this module + */ + return { + SequentialTitleIterator : SequentialTitleIterator, + FindPrefixOffset : FindPrefixOffset + }; +}); diff --git a/www/js/lib/util.js b/www/js/lib/util.js index 2e819d29..b1506009 100644 --- a/www/js/lib/util.js +++ b/www/js/lib/util.js @@ -20,6 +20,7 @@ * along with Evopedia (file LICENSE-GPLv3.txt). If not, see */ define(function(require) { + var jQuery = require('jquery'); /** * Utility function : return true if the given string ends with the suffix @@ -112,6 +113,25 @@ define(function(require) { return (r > 0 ? enc.slice(0, r - 3) : enc) + '==='.slice(r || 3); } + + /** + * Reads a Uint8Array from the given file starting at byte offset begin and + * not including byte offset end. 
+ * @returns jQuery promise + */ + function readFileSlice(file, begin, end) { + var deferred = jQuery.Deferred(); + var reader = new FileReader(); + reader.onload = function(e) { + deferred.resolve(new Uint8Array(e.target.result)); + } + reader.onerror = reader.onabort = function(e) { + deferred.reject(e); + } + reader.readAsArrayBuffer(file.slice(begin, end)); + return deferred.promise(); + } + /** * Functions and classes exposed by this module @@ -122,6 +142,7 @@ define(function(require) { readIntegerFrom2Bytes : readIntegerFrom2Bytes, readFloatFrom4Bytes : readFloatFrom4Bytes, uint8ArrayToHex : uint8ArrayToHex, - uint8ArrayToBase64 : uint8ArrayToBase64 + uint8ArrayToBase64 : uint8ArrayToBase64, + readFileSlice : readFileSlice }; -}); \ No newline at end of file +});