Added title iterator as preparation for infix search.

This commit is contained in:
Peter 2014-02-14 02:14:40 +01:00
parent c592027d1d
commit 9430603001
4 changed files with 212 additions and 150 deletions

View File

@ -29,6 +29,7 @@ define(function(require) {
var util = require('util');
var geometry = require('geometry');
var jQuery = require('jquery');
var titleIterators = require('titleIterators');
// Declare the webworker that can uncompress with bzip2 algorithm
var webworkerBzip2 = new Worker("js/lib/webworker_bzip2.js");
@ -303,114 +304,30 @@ define(function(require) {
});
};
/**
* This function is recursively called after each asynchronous read, so that
* to find the closest index in titleFile to the given prefix
* When found, call the callbackFunction with the index
*
* @param reader
* @param normalizedPrefix
* @param lo
* @param hi
* @param callbackFunction
*/
LocalArchive.prototype.recursivePrefixSearch = function(reader, normalizedPrefix, lo, hi, callbackFunction) {
if (lo < hi - 1) {
var mid = Math.floor((lo + hi) / 2);
var blob = this.titleFile.slice(mid, mid + MAX_TITLE_LENGTH);
var currentLocalArchiveInstance = this;
reader.onload = function(e) {
var binaryTitleFile = e.target.result;
var byteArray = new Uint8Array(binaryTitleFile);
// Look for the index of the next NewLine
var newLineIndex = 0;
while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
newLineIndex++;
}
var startIndex = 0;
if (mid > 0) {
startIndex = newLineIndex + 16;
newLineIndex = startIndex;
// Look for the index of the next NewLine
while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
newLineIndex++;
}
}
if (newLineIndex === startIndex) {
// End of file reached
hi = mid;
}
else {
var normalizedTitle = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(
utf8.parse(byteArray.subarray(startIndex, newLineIndex)));
if (normalizedTitle < normalizedPrefix) {
lo = mid + newLineIndex - 1;
}
else {
hi = mid;
}
}
currentLocalArchiveInstance.recursivePrefixSearch(reader, normalizedPrefix, lo, hi, callbackFunction);
};
// Read the file as a binary string
reader.readAsArrayBuffer(blob);
}
else {
if (lo > 0) {
// Let lo point to the start of an entry
lo++;
lo++;
}
// We found the closest title at index lo
callbackFunction(lo);
}
};
/**
* Read the titles in the title file starting at the given offset (maximum titleCount), and call the callbackFunction with this list of Title instances
* @param titleOffset
* @param titleOffset offset into the title file - it has to point excatly
* to the start of a title entry
* @param titleCount maximum number of titles to retrieve
* @param callbackFunction
*/
LocalArchive.prototype.getTitlesStartingAtOffset = function(titleOffset, titleCount, callbackFunction) {
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('Title file read cancelled');
};
var currentLocalArchiveInstance = this;
reader.onload = function(e) {
var binaryTitleFile = e.target.result;
var byteArray = new Uint8Array(binaryTitleFile);
var i = 0;
var newLineIndex = 0;
var titleNumber = 0;
var titleList = new Array();
while (i < byteArray.length && titleNumber < titleCount) {
// Look for the index of the next NewLine
newLineIndex += 15;
while (newLineIndex < byteArray.length && byteArray[newLineIndex] != 10) {
newLineIndex++;
var titles = [];
jQuery.when().then(function() {
var iterator = new titleIterators.SequentialTitleIterator(this, titleOffset);
function addNext() {
if (titles.length >= titleCount) {
return titles;
}
// Copy the encodedTitle in a new Array
var encodedTitle = new Uint8Array(newLineIndex - i);
for (var j = 0; j < newLineIndex - i; j++) {
encodedTitle[j] = byteArray[i + j];
}
var title = evopediaTitle.Title.parseTitle(encodedTitle, currentLocalArchiveInstance, i);
titleList[titleNumber] = title;
titleNumber++;
i = newLineIndex + 1;
return iterator.advance().then(function(title) {
if (title == null)
return titles;
titles.push(title);
return addNext();
});
}
callbackFunction(titleList);
};
var blob = this.titleFile.slice(titleOffset, titleOffset + titleCount * MAX_TITLE_LENGTH);
// Read in the file as a binary string
reader.readAsArrayBuffer(blob);
return addNext();
}).then(callbackFunction, errorHandler);
};
/**
@ -420,30 +337,23 @@ define(function(require) {
* @param callbackFunction
*/
LocalArchive.prototype.getTitleByName = function(titleName, callbackFunction) {
var titleFileSize = this.titleFile.size;
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('Title file read cancelled');
};
var currentLocalArchiveInstance = this;
var normalizedTitleName = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(titleName);
this.recursivePrefixSearch(reader, normalizedTitleName, 0, titleFileSize, function(titleOffset) {
currentLocalArchiveInstance.getTitlesStartingAtOffset(titleOffset, MAX_TITLES_WITH_SAME_NORMALIZED_NAME, function(titleList) {
if (titleList !== null && titleList.length>0) {
for (var i=0; i<titleList.length; i++) {
var title = titleList[i];
if (title.name === titleName) {
// The title has been found
callbackFunction(title);
return;
}
}
var that = this;
var normalize = this.getNormalizeFunction();
var normalizedTitleName = normalize(titleName);
titleIterators.FindPrefixOffset(this.titleFile, titleName, normalize).then(function(offset) {
var iterator = new titleIterators.SequentialTitleIterator(that, offset);
function check(title) {
if (title == null || normalize(title.name) !== normalizedTitleName) {
return null;
} else if (title.name === titleName) {
return title;
} else {
return iterator.advance().then(check);
}
// The title has not been found
callbackFunction(null);
});
});
}
return iterator.advance().then(check);
}).then(callbackFunction, errorHandler);
};
/**
@ -461,32 +371,30 @@ define(function(require) {
* @param callbackFunction
*/
LocalArchive.prototype.findTitlesWithPrefix = function(prefix, maxSize, callbackFunction) {
var titleFileSize = this.titleFile.size;
if (prefix) {
prefix = this.normalizeStringIfCompatibleArchive(prefix);
}
var that = this;
var titles = [];
var normalize = this.getNormalizeFunction();
prefix = normalize(prefix);
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('Title file read cancelled');
};
var currentLocalArchiveInstance = this;
var normalizedPrefix = this.normalizeStringIfCompatibleArchive(prefix);
this.recursivePrefixSearch(reader, normalizedPrefix, 0, titleFileSize, function(titleOffset) {
currentLocalArchiveInstance.getTitlesStartingAtOffset(titleOffset, maxSize, function(titleList) {
// Keep only the titles with names starting with the prefix
var i = 0;
for (i = 0; i < titleList.length; i++) {
var titleName = titleList[i].name;
var normalizedTitleName = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(titleName);
if (normalizedTitleName.length < normalizedPrefix.length || normalizedTitleName.substring(0, normalizedPrefix.length) !== normalizedPrefix) {
break;
}
titleIterators.FindPrefixOffset(this.titleFile, prefix, normalize).then(function(offset) {
var iterator = new titleIterators.SequentialTitleIterator(that, offset);
function addNext() {
if (titles.length >= maxSize) {
return titles;
}
callbackFunction(titleList.slice(0, i));
});
});
return iterator.advance().then(function(title) {
if (title == null)
return titles;
// check whether this title really starts with the prefix
var name = normalize(title.name);
if (name.length < prefix.length || name.substring(0, prefix.length) != prefix)
return titles;
titles.push(title);
return addNext();
});
}
return addNext();
}).then(callbackFunction, errorHandler);
};
@ -950,6 +858,18 @@ define(function(require) {
}
};
/**
* Returns a function that normalizes strings if the current archive is compatible.
* If it is not, returns the identity function.
*/
LocalArchive.prototype.getNormalizeFunction = function() {
if (this.normalizedTitles === true) {
return normalize_string.normalizeString;
} else {
return function(string) { return string; }
}
};
/**
* ErrorHandler for FileReader
* @param {type} evt

View File

@ -0,0 +1,121 @@
/**
* titleIterators.js : Various classes to iterate over titles, for example as a
* result of searching.
*
* Copyright 2014 Evopedia developers
* License GPL v3:
*
* This file is part of Evopedia.
*
* Evopedia is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Evopedia is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Evopedia (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
*/
define(['utf8', 'title', 'util', 'jquery'], function(utf8, evopediaTitle, util, jQuery) {
// Maximum length of a title
// 300 bytes is arbitrary : we actually do not really know how long the titles will be
// But mediawiki titles seem to be limited to ~200 bytes, so 300 should be more than enough
var MAX_TITLE_LENGTH = 300;
/**
* Iterates over all titles starting at the given offset.
* The asynchronous method advance has to be called before this.title is
* valid.
*/
function SequentialTitleIterator(archive, offset) {
this._titleFile = archive.titleFile;
this._archive = archive;
this._offset = offset;
this.title = null;
}
/**
* Advances to the next title (or the first), if possible.
* @returns jQuery promise containing the next title or null if there is no
* next title
*/
SequentialTitleIterator.prototype.advance = function() {
if (this._offset >= this._titleFile.size) {
this.title = null;
return jQuery.when(this.title);
}
var that = this;
return util.readFileSlice(this._titleFile, this._offset,
this._offset + MAX_TITLE_LENGTH).then(function(byteArray) {
var newLineIndex = 15;
while (newLineIndex < byteArray.length && byteArray[newLineIndex] != 10) {
newLineIndex++;
}
var encodedTitle = byteArray.subarray(0, newLineIndex);
that._title = evopediaTitle.Title.parseTitle(encodedTitle, that._archive, that._offset);
that._offset += newLineIndex + 1;
return that._title;
});
}
/**
* Searches for the offset into the given title file where the first title
* with the given prefix (or lexicographically larger) is located.
* The given function normalize is applied to every title before comparison.
* @returns jQuery promise giving the offset
*/
function FindPrefixOffset(titleFile, prefix, normalize) {
prefix = normalize(prefix);
var lo = 0;
var hi = titleFile.size;
var iterate = function() {
if (lo >= hi) {
if (lo > 0)
lo += 2; // Let lo point to the start of an entry
return jQuery.when(lo);
} else {
var mid = Math.floor((lo + hi) / 2);
return util.readFileSlice(titleFile, mid, mid + MAX_TITLE_LENGTH).then(function(byteArray) {
// Look for the index of the next NewLine
var newLineIndex = 0;
while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
newLineIndex++;
}
var startIndex = 0;
if (mid > 0) {
startIndex = newLineIndex + 16;
newLineIndex = startIndex;
// Look for the index of the next NewLine
while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
newLineIndex++;
}
}
if (newLineIndex === startIndex) {
// End of file reached
hi = mid;
} else {
var normalizedTitle = normalize(utf8.parse(byteArray.subarray(startIndex, newLineIndex)));
if (normalizedTitle < prefix) {
lo = mid + newLineIndex - 1;
} else {
hi = mid;
}
}
return iterate();
});
}
}
return iterate();
}
/**
* Functions and classes exposed by this module
*/
return {
SequentialTitleIterator : SequentialTitleIterator,
FindPrefixOffset : FindPrefixOffset
};
});

View File

@ -20,6 +20,7 @@
* along with Evopedia (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
*/
define(function(require) {
var jQuery = require('jquery');
/**
* Utility function : return true if the given string ends with the suffix
@ -113,6 +114,25 @@ define(function(require) {
return (r > 0 ? enc.slice(0, r - 3) : enc) + '==='.slice(r || 3);
}
/**
* Reads a Uint8Array from the given file starting at byte offset begin and
* not including byte offset end.
* @returns jQuery promise
*/
function readFileSlice(file, begin, end) {
var deferred = jQuery.Deferred();
var reader = new FileReader();
reader.onload = function(e) {
deferred.resolve(new Uint8Array(e.target.result));
}
reader.onerror = reader.onabort = function(e) {
deferred.reject(e);
}
reader.readAsArrayBuffer(file.slice(begin, end));
return deferred.promise();
}
/**
* Functions and classes exposed by this module
*/
@ -122,6 +142,7 @@ define(function(require) {
readIntegerFrom2Bytes : readIntegerFrom2Bytes,
readFloatFrom4Bytes : readFloatFrom4Bytes,
uint8ArrayToHex : uint8ArrayToHex,
uint8ArrayToBase64 : uint8ArrayToBase64
uint8ArrayToBase64 : uint8ArrayToBase64,
readFileSlice : readFileSlice
};
});