Added title iterator as preparation for infix search.

2025-09-24 04:54:51 -04:00 · 2014-02-14 02:14:40 +01:00 · 2014-02-14 02:14:40 +01:00 · 9430603001
commit 9430603001
parent c592027d1d
4 changed files with 212 additions and 150 deletions
--- a/www/js/lib/archive.js
+++ b/www/js/lib/archive.js
@ -29,6 +29,7 @@ define(function(require) {
    var util = require('util');
    var geometry = require('geometry');
    var jQuery = require('jquery');
    var titleIterators = require('titleIterators');
    // Declare the webworker that can uncompress with bzip2 algorithm
    var webworkerBzip2 = new Worker("js/lib/webworker_bzip2.js");
@ -303,114 +304,30 @@ define(function(require) {
        });
    };
    /**
     * Binary search over the title file for the first entry whose normalized
     * title is not smaller than the given prefix. The function calls itself
     * again after each asynchronous read until the search interval is empty,
     * then calls the callbackFunction with the byte offset that was found.
     * 
     * Every entry of the title file consists of a 15-byte header, the title
     * text and a terminating newline (byte 10).
     * 
     * @param reader FileReader used for every read; its onload handler is
     *               replaced on each step
     * @param normalizedPrefix prefix to look for, already normalized
     * @param lo lower bound (byte offset) of the search interval
     * @param hi upper bound (byte offset) of the search interval
     * @param callbackFunction called with the byte offset of the closest entry
     */
    LocalArchive.prototype.recursivePrefixSearch = function(reader, normalizedPrefix, lo, hi, callbackFunction) {
        // Number of bytes that precede the title text in every entry
        var ENTRY_HEADER_LENGTH = 15;
        if (lo >= hi) {
            // Once lo has moved, it sits one byte before the newline that ends
            // an entry: adding 2 lets it point to the start of the next entry
            callbackFunction(lo > 0 ? lo + 2 : lo);
            return;
        }
        var mid = Math.floor((lo + hi) / 2);
        var blob = this.titleFile.slice(mid, mid + MAX_TITLE_LENGTH);
        var currentLocalArchiveInstance = this;
        reader.onload = function(e) {
            var byteArray = new Uint8Array(e.target.result);
            // At the very start of the file we are already on an entry
            // boundary, so only the header has to be skipped
            var startIndex = ENTRY_HEADER_LENGTH;
            var newLineIndex;
            if (mid > 0) {
                // mid usually falls inside an entry: skip to the end of that
                // entry, then over the newline and the header of the next one
                newLineIndex = 0;
                while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
                    newLineIndex++;
                }
                startIndex = newLineIndex + 1 + ENTRY_HEADER_LENGTH;
            }
            // Look for the newline that ends the title being compared
            newLineIndex = startIndex;
            while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
                newLineIndex++;
            }
            if (newLineIndex === startIndex) {
                // End of file reached
                hi = mid;
            }
            else {
                var normalizedTitle = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(
                        utf8.parse(byteArray.subarray(startIndex, newLineIndex)));
                if (normalizedTitle < normalizedPrefix) {
                    lo = mid + newLineIndex - 1;
                }
                else {
                    hi = mid;
                }
            }
            currentLocalArchiveInstance.recursivePrefixSearch(reader, normalizedPrefix, lo, hi, callbackFunction);
        };
        // Read the slice of the title file as an ArrayBuffer
        reader.readAsArrayBuffer(blob);
    };
    /**
     * Read the titles in the title file starting at the given offset (maximum titleCount), and call the callbackFunction with this list of Title instances
-     * @param titleOffset
+     * @param titleOffset offset into the title file - it has to point exactly
     *                    to the start of a title entry
     * @param titleCount maximum number of titles to retrieve
     * @param callbackFunction
     */
    LocalArchive.prototype.getTitlesStartingAtOffset = function(titleOffset, titleCount, callbackFunction) {
-        var reader = new FileReader();
+        var titles = [];
-        reader.onerror = errorHandler;
+        jQuery.when().then(function() {
-        reader.onabort = function(e) {
+            var iterator = new titleIterators.SequentialTitleIterator(this, titleOffset);
-            alert('Title file read cancelled');
+            function addNext() {
-        };
+                if (titles.length >= titleCount) {
-
+                    return titles;
        var currentLocalArchiveInstance = this;
        reader.onload = function(e) {
            var binaryTitleFile = e.target.result;
            var byteArray = new Uint8Array(binaryTitleFile);
            var i = 0;
            var newLineIndex = 0;
            var titleNumber = 0;
            var titleList = new Array();
            while (i < byteArray.length && titleNumber < titleCount) {
                // Look for the index of the next NewLine
                newLineIndex += 15;
                while (newLineIndex < byteArray.length && byteArray[newLineIndex] != 10) {
                    newLineIndex++;
                }
-
+                return iterator.advance().then(function(title) {
-                // Copy the encodedTitle in a new Array
+                    if (title == null)
-                var encodedTitle = new Uint8Array(newLineIndex - i);
+                        return titles;
-                for (var j = 0; j < newLineIndex - i; j++) {
+                    titles.push(title);
-                    encodedTitle[j] = byteArray[i + j];
+                    return addNext();
                });
            }
-
+            return addNext();
-                var title = evopediaTitle.Title.parseTitle(encodedTitle, currentLocalArchiveInstance, i);
+        }).then(callbackFunction, errorHandler);
                titleList[titleNumber] = title;
                titleNumber++;
                i = newLineIndex + 1;
            }
            callbackFunction(titleList);
        };
        var blob = this.titleFile.slice(titleOffset, titleOffset + titleCount * MAX_TITLE_LENGTH);
        // Read in the file as a binary string
        reader.readAsArrayBuffer(blob);
    };
    /**
@ -420,30 +337,23 @@ define(function(require) {
     * @param callbackFunction
     */
    LocalArchive.prototype.getTitleByName = function(titleName, callbackFunction) {
-        var titleFileSize = this.titleFile.size;
+        var that = this;
-        var reader = new FileReader();
+        var normalize = this.getNormalizeFunction();
-        reader.onerror = errorHandler;
+        var normalizedTitleName = normalize(titleName);
-        reader.onabort = function(e) {
+
-            alert('Title file read cancelled');
+        titleIterators.FindPrefixOffset(this.titleFile, titleName, normalize).then(function(offset) {
-        };
+            var iterator = new titleIterators.SequentialTitleIterator(that, offset);
-        var currentLocalArchiveInstance = this;
+            function check(title) {
-        var normalizedTitleName = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(titleName);
+                if (title == null || normalize(title.name) !== normalizedTitleName) {
-        this.recursivePrefixSearch(reader, normalizedTitleName, 0, titleFileSize, function(titleOffset) {
+                    return null;
-            currentLocalArchiveInstance.getTitlesStartingAtOffset(titleOffset, MAX_TITLES_WITH_SAME_NORMALIZED_NAME, function(titleList) {
+                } else if (title.name === titleName) {
-                if (titleList !== null && titleList.length>0) {
+                    return title;
-                    for (var i=0; i<titleList.length; i++) {
+                } else {
-                        var title = titleList[i];
+                    return iterator.advance().then(check);
                        if (title.name === titleName) {
                            // The title has been found
                            callbackFunction(title);
                            return;
                }
            }
-                }
+            return iterator.advance().then(check);
-                // The title has not been found
+        }).then(callbackFunction, errorHandler);
                callbackFunction(null);
            });
        });
    };
    /**
@ -461,32 +371,30 @@ define(function(require) {
     * @param callbackFunction
     */
    LocalArchive.prototype.findTitlesWithPrefix = function(prefix, maxSize, callbackFunction) {
-        var titleFileSize = this.titleFile.size;
+        var that = this;
-        if (prefix) {
+        var titles = [];
-            prefix = this.normalizeStringIfCompatibleArchive(prefix);
+        var normalize = this.getNormalizeFunction();
-        }
+        prefix = normalize(prefix);
-        var reader = new FileReader();
+        titleIterators.FindPrefixOffset(this.titleFile, prefix, normalize).then(function(offset) {
-        reader.onerror = errorHandler;
+            var iterator = new titleIterators.SequentialTitleIterator(that, offset);
-        reader.onabort = function(e) {
+            function addNext() {
-            alert('Title file read cancelled');
+                if (titles.length >= maxSize) {
-        };
+                    return titles;
        var currentLocalArchiveInstance = this;
        var normalizedPrefix = this.normalizeStringIfCompatibleArchive(prefix);
        this.recursivePrefixSearch(reader, normalizedPrefix, 0, titleFileSize, function(titleOffset) {
            currentLocalArchiveInstance.getTitlesStartingAtOffset(titleOffset, maxSize, function(titleList) {
                // Keep only the titles with names starting with the prefix
                var i = 0;
                for (i = 0; i < titleList.length; i++) {
                    var titleName = titleList[i].name;
                    var normalizedTitleName = currentLocalArchiveInstance.normalizeStringIfCompatibleArchive(titleName);
                    if (normalizedTitleName.length < normalizedPrefix.length || normalizedTitleName.substring(0, normalizedPrefix.length) !== normalizedPrefix) {
                        break;
                }
                return iterator.advance().then(function(title) {
                    if (title == null)
                        return titles;
                    // check whether this title really starts with the prefix
                    var name = normalize(title.name);
                    if (name.length < prefix.length || name.substring(0, prefix.length) != prefix)
                        return titles;
                    titles.push(title);
                    return addNext();
                });
            }
-                callbackFunction(titleList.slice(0, i));
+            return addNext();
-            });
+        }).then(callbackFunction, errorHandler);
        });
    };
@ -950,6 +858,18 @@ define(function(require) {
        }
    };
    /**
     * Gives back the function to apply to titles before they are compared.
     * When the archive is flagged with normalizedTitles === true, this is
     * normalize_string.normalizeString; for any other archive it is a
     * function that hands its argument back unchanged.
     */
    LocalArchive.prototype.getNormalizeFunction = function() {
        var identity = function(string) {
            return string;
        };
        return (this.normalizedTitles === true) ? normalize_string.normalizeString : identity;
    };
    /**
     * ErrorHandler for FileReader
     * @param {type} evt
--- a/www/js/lib/titleIterators.js
+++ b/www/js/lib/titleIterators.js
@ -0,0 +1,121 @@
 /**
 * titleIterators.js : Various classes to iterate over titles, for example as a
 * result of searching.
 * 
 * Copyright 2014 Evopedia developers
 * License GPL v3:
 * 
 * This file is part of Evopedia.
 * 
 * Evopedia is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Evopedia is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Evopedia (file LICENSE-GPLv3.txt).  If not, see <http://www.gnu.org/licenses/>
 */
 define(['utf8', 'title', 'util', 'jquery'], function(utf8, evopediaTitle, util, jQuery) {
    // Maximum length of a title, in bytes; used as the size of every slice
    // read from the title file in this module.
    // 300 bytes is arbitrary: we do not really know how long the titles will be,
    // but MediaWiki titles seem to be limited to ~200 bytes, so 300 should be more than enough.
    // NOTE(review): the slices also have to hold the 15 bytes that precede each
    // title and the terminating newline - confirm 300 still leaves enough room.
    var MAX_TITLE_LENGTH = 300;
    /**
     * Iterates over all titles of an archive, starting at the given offset
     * into its title file.
     * this.title holds the title read by the latest call to advance. It is
     * null until the asynchronous method advance has completed once, and null
     * again after the end of the title file has been reached.
     * 
     * @param archive archive whose titleFile is read; it is also handed to
     *                Title.parseTitle for every entry
     * @param offset byte offset into the title file - it has to point exactly
     *               to the start of a title entry
     */
    function SequentialTitleIterator(archive, offset) {
        this._titleFile = archive.titleFile;
        this._archive = archive;
        this._offset = offset;
        this.title = null;
    }
    /**
     * Advances to the next title (or the first), if possible, and stores it
     * in this.title.
     * @returns jQuery promise containing the next title or null if there is no
     * next title
     */
    SequentialTitleIterator.prototype.advance = function() {
        if (this._offset >= this._titleFile.size) {
            this.title = null;
            return jQuery.when(this.title);
        }
        var that = this;
        return util.readFileSlice(this._titleFile, this._offset,
                                  this._offset + MAX_TITLE_LENGTH).then(function(byteArray) {
            // The first 15 bytes of an entry are not part of the title text, so
            // the search for the terminating newline (byte 10) starts after them
            var newLineIndex = 15;
            while (newLineIndex < byteArray.length && byteArray[newLineIndex] !== 10) {
                newLineIndex++;
            }
            var encodedTitle = byteArray.subarray(0, newLineIndex);
            that.title = evopediaTitle.Title.parseTitle(encodedTitle, that._archive, that._offset);
            // Step over the entry and its newline, to the start of the next entry
            that._offset += newLineIndex + 1;
            return that.title;
        });
    };
    /**
     * Searches for the offset into the given title file where the first title
     * with the given prefix (or lexicographically larger) is located.
     * The given function normalize is applied to the prefix and to every title
     * before comparison.
     * 
     * The search is a binary search over byte offsets. Every entry of the
     * title file consists of a 15-byte header, the title text and a
     * terminating newline (byte 10).
     * 
     * @param titleFile file to search; read through util.readFileSlice
     * @param prefix prefix to look for
     * @param normalize function applied to strings before they are compared
     * @returns jQuery promise giving the offset
     */
    function FindPrefixOffset(titleFile, prefix, normalize) {
        // Number of bytes that precede the title text in every entry
        var ENTRY_HEADER_LENGTH = 15;
        var NEW_LINE = 10;
        prefix = normalize(prefix);
        var lo = 0;
        var hi = titleFile.size;
        // Index of the first newline at or after 'from' (or the first index
        // past the data when there is none)
        var indexOfNewLine = function(byteArray, from) {
            var index = from;
            while (index < byteArray.length && byteArray[index] !== NEW_LINE) {
                index++;
            }
            return index;
        };
        var iterate = function() {
            if (lo >= hi) {
                if (lo > 0)
                    lo += 2; // Let lo point to the start of an entry
                return jQuery.when(lo);
            }
            var mid = Math.floor((lo + hi) / 2);
            return util.readFileSlice(titleFile, mid, mid + MAX_TITLE_LENGTH).then(function(byteArray) {
                // At the very start of the file we are already on an entry
                // boundary, so only the header has to be skipped
                var startIndex = ENTRY_HEADER_LENGTH;
                if (mid > 0) {
                    // mid usually falls inside an entry: skip to the end of
                    // that entry, then over the newline and the next header
                    startIndex = indexOfNewLine(byteArray, 0) + 1 + ENTRY_HEADER_LENGTH;
                }
                var newLineIndex = indexOfNewLine(byteArray, startIndex);
                if (newLineIndex === startIndex) {
                    // End of file reached
                    hi = mid;
                } else {
                    var normalizedTitle = normalize(utf8.parse(byteArray.subarray(startIndex, newLineIndex)));
                    if (normalizedTitle < prefix) {
                        // One byte before the newline that ends the compared entry
                        lo = mid + newLineIndex - 1;
                    } else {
                        hi = mid;
                    }
                }
                return iterate();
            });
        };
        return iterate();
    }
    /**
     * Functions and classes exposed by this module
     */
    return {
        SequentialTitleIterator : SequentialTitleIterator,
        FindPrefixOffset : FindPrefixOffset
    };
 });
--- a/www/js/lib/util.js
+++ b/www/js/lib/util.js
@ -20,6 +20,7 @@
 * along with Evopedia (file LICENSE-GPLv3.txt).  If not, see <http://www.gnu.org/licenses/>
 */
 define(function(require) {
    var jQuery = require('jquery');
    /**
     * Utility function : return true if the given string ends with the suffix
@ -113,6 +114,25 @@ define(function(require) {
        return (r > 0 ? enc.slice(0, r - 3) : enc) + '==='.slice(r || 3);
    }
    /**
     * Asynchronously reads part of a file into a Uint8Array.
     * @param file file (or blob) to read from
     * @param begin byte offset of the first byte to read
     * @param end byte offset just past the last byte to read
     * @returns jQuery promise that is resolved with the Uint8Array, or
     * rejected with the reader's event if the read fails or is aborted
     */
    function readFileSlice(file, begin, end) {
        var pending = jQuery.Deferred();
        var fileReader = new FileReader();
        var fail = function(event) {
            pending.reject(event);
        };
        fileReader.onload = function(event) {
            var buffer = event.target.result;
            pending.resolve(new Uint8Array(buffer));
        };
        fileReader.onerror = fail;
        fileReader.onabort = fail;
        fileReader.readAsArrayBuffer(file.slice(begin, end));
        return pending.promise();
    }
    /**
     * Functions and classes exposed by this module
     */
@ -122,6 +142,7 @@ define(function(require) {
        readIntegerFrom2Bytes : readIntegerFrom2Bytes,
        readFloatFrom4Bytes : readFloatFrom4Bytes,
        uint8ArrayToHex : uint8ArrayToHex,
-        uint8ArrayToBase64 : uint8ArrayToBase64
+        uint8ArrayToBase64 : uint8ArrayToBase64,
        readFileSlice : readFileSlice
    };
 });