Improvements in case variants for title search.

Based on the suggestions made by peter-x in #160.
The code is now more readable and should also be slightly faster.
This commit is contained in:
mossroy 2016-01-16 14:44:09 +01:00
parent 8cf42f8140
commit a1e581ff99
3 changed files with 38 additions and 40 deletions

View File

@ -381,7 +381,7 @@ define(['jquery', 'title', 'archive', 'zimArchive', 'zimDirEntry', 'util', 'geom
// Unit test for the title de-duplication helper exposed by util:
// the input contains the titles "a" and "c" twice, and the expected
// result keeps a single entry per title, in order of first appearance.
test("check remove duplicates of an array of title objects", function() {
var array = [{title:"a"}, {title:"b"}, {title:"c"}, {title:"a"}, {title:"c"}, {title:"d"}];
var expectedArray = [{title:"a"}, {title:"b"}, {title:"c"}, {title:"d"}];
// NOTE(review): the next two assertions differ only in the helper name
// (removeDuplicatesInArray vs removeDuplicateTitlesInArray) — confirm
// whether the first one is a superseded line that should be dropped.
deepEqual(util.removeDuplicatesInArray(array), expectedArray, "Duplicates should be removed from the array");
deepEqual(util.removeDuplicateTitlesInArray(array), expectedArray, "Duplicates should be removed from the array");
});
module("evopedia_articles_nearby");

View File

@ -38,12 +38,8 @@ define(['q'], function(q) {
* @returns {String}
*/
function ucFirstLetter(string) {
if (string) {
if (string.length >= 1) {
return string.charAt(0).toLocaleUpperCase() + string.slice(1);
} else {
return string;
}
if (string && string.length >= 1) {
return string[0].toLocaleUpperCase() + string.slice(1);
} else {
return string;
}
@ -104,6 +100,25 @@ define(['q'], function(q) {
return array;
}
/**
 * Builds a new array that holds every distinct value of the given array
 * exactly once, at the position of its first occurrence.
 * Each element is looked up with a linear indexOf scan over the values
 * kept so far, so this is only meant for small arrays.
 * Source : http://codereview.stackexchange.com/questions/60128/removing-duplicates-from-an-array-quickly
 *
 * @param {Array.<String>} array Array of Strings, possibly containing duplicates
 * @returns {Array.<String>} a new array with the same Strings, without duplicates
 */
function removeDuplicateStringsInSmallArray(array) {
    var kept = [];
    var position = 0;
    while (position < array.length) {
        var candidate = array[position];
        position++;
        // Only keep the first occurrence of each value
        if (kept.indexOf(candidate) === -1) {
            kept.push(candidate);
        }
    }
    return kept;
}
/**
* Read an integer encoded in 4 bytes, little endian
* @param {Array} byteArray
@ -298,7 +313,8 @@ define(['q'], function(q) {
ucFirstLetter: ucFirstLetter,
lcFirstLetter: lcFirstLetter,
ucEveryFirstLetter: ucEveryFirstLetter,
removeDuplicatesInArray: removeDuplicateTitlesInArray,
removeDuplicateTitlesInArray: removeDuplicateTitlesInArray,
removeDuplicateStringsInSmallArray: removeDuplicateStringsInSmallArray,
readIntegerFrom4Bytes: readIntegerFrom4Bytes,
readIntegerFrom2Bytes : readIntegerFrom2Bytes,
readFloatFrom4Bytes : readFloatFrom4Bytes,

View File

@ -113,7 +113,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
* For now, ZIM titles are case-sensitive.
* So, as a workaround, we try several case variants of the prefix to find more results.
* This should be improved once the ZIM format is modified to store normalized titles.
* See https://github.com/mossroy/evopedia-html5/issues/117
* See https://phabricator.wikimedia.org/T108536
*
* @param {String} prefix
* @param {Integer} resultSize
@ -121,39 +121,21 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
*/
ZIMArchive.prototype.findTitlesWithPrefix = function(prefix, resultSize, callback) {
    var that = this;
    // Case variants of the prefix, tried in this order: as typed, first letter
    // upper-case, first letter lower-case, first letter of every word upper-case.
    // Identical variants are collapsed so the same search is never run twice.
    var prefixVariants = util.removeDuplicateStringsInSmallArray([
        prefix,
        util.ucFirstLetter(prefix),
        util.lcFirstLetter(prefix),
        util.ucEveryFirstLetter(prefix)
    ]);
    // Titles accumulated over all the variants searched so far
    var titles = [];
    // Index of the next variant to search
    var nextVariantIndex = 0;

    // Searches one variant, then calls itself from the search callback until
    // every variant has been tried or enough titles have been collected.
    function searchNextVariant() {
        if (nextVariantIndex >= prefixVariants.length || titles.length >= resultSize) {
            callback(titles);
            return;
        }
        var variant = prefixVariants[nextVariantIndex];
        nextVariantIndex++;
        // Only ask for the number of titles that are still missing
        that.findTitlesWithPrefixCaseSensitive(variant, resultSize - titles.length, function(newTitles) {
            titles.push.apply(titles, newTitles);
            searchNextVariant();
        });
    }
    searchNextVariant();
};
/**