Merge pull request #160 from kiwix/issue124-title-search-with-case-variants

On title search, try some upper/lower case variants.
This commit is contained in:
Mossroy 2016-01-16 14:46:46 +01:00
commit f6ea93903a
3 changed files with 171 additions and 6 deletions

View File

@ -377,6 +377,21 @@ define(['jquery', 'title', 'archive', 'zimArchive', 'zimDirEntry', 'util', 'geom
var float = util.readFloatFrom4Bytes(byteArray, 0);
equal(float, -118.625, "the IEEE_754 float should be converted as -118.625");
});
test("check upper/lower case variations", function() {
    // One row per assertion: the util method to call, its input,
    // the expected output and the message reported by QUnit.
    var cases = [
        { method: "ucFirstLetter", input: "téléphone", expected: "Téléphone",
          message: "The first letter should be upper-case" },
        { method: "lcFirstLetter", input: "Paris", expected: "paris",
          message: "The first letter should be lower-case" },
        { method: "ucEveryFirstLetter", input: "le Couvre-chef Est sur le porte-manteaux", expected: "Le Couvre-Chef Est Sur Le Porte-Manteaux",
          message: "The first letter of every word should be upper-case" },
        { method: "ucFirstLetter", input: "épée", expected: "Épée",
          message: "The first letter should be upper-case (with accent)" }
    ];
    for (var caseIndex = 0; caseIndex < cases.length; caseIndex++) {
        var current = cases[caseIndex];
        // Call through util so each helper is invoked exactly as a method of util
        equal(util[current.method](current.input), current.expected, current.message);
    }
});
test("check remove duplicates of an array of title objects", function() {
    // Wraps a plain string in the minimal {title: ...} shape compared by the helper
    var asTitleObject = function(titleString) {
        return {title: titleString};
    };
    var withDuplicates = ["a", "b", "c", "a", "c", "d"].map(asTitleObject);
    var withoutDuplicates = ["a", "b", "c", "d"].map(asTitleObject);
    var actual = util.removeDuplicateTitlesInArray(withDuplicates);
    deepEqual(actual, withoutDuplicates, "Duplicates should be removed from the array");
});
module("evopedia_articles_nearby");
asyncTest("check articles found nearby France and Germany", function() {
@ -498,9 +513,7 @@ define(['jquery', 'title', 'archive', 'zimArchive', 'zimDirEntry', 'util', 'geom
module("zim_title_search_and_read");
asyncTest("check DirEntry.fromStringId 'A Fool for You'", function() {
// Construct the DirEntry for Arikitcac article
// NB : this must be done inside a test or asyncTest function, else the localZimArchive is not ready yet
var arikitcacDirEntry = zimDirEntry.DirEntry.fromStringId(localZimArchive._file, "5856|7|A|0|2|A_Fool_for_You.html|A Fool for You|false|undefined");
var aFoolForYouDirEntry = zimDirEntry.DirEntry.fromStringId(localZimArchive._file, "5856|7|A|0|2|A_Fool_for_You.html|A Fool for You|false|undefined");
expect(2);
var callbackFunction = function(title, htmlArticle) {
@ -510,7 +523,37 @@ define(['jquery', 'title', 'archive', 'zimArchive', 'zimDirEntry', 'util', 'geom
ok(htmlArticle.match("^.*<h1[^>]*>A Fool for You</h1>"), "'A Fool for You' title somewhere in the article");
start();
};
localZimArchive.readArticle(arikitcacDirEntry, callbackFunction);
localZimArchive.readArticle(aFoolForYouDirEntry, callbackFunction);
});
asyncTest("check findTitlesWithPrefix 'A'", function() {
    // Two assertions are expected: the result count and the first title
    expect(2);
    localZimArchive.findTitlesWithPrefix('A', 5, function(titleList) {
        ok(titleList && titleList.length === 5, "Article list with 5 results");
        var head = titleList[0];
        equal(head.title , 'A Fool for You', 'First result should be "A Fool for You"');
        // Resume the QUnit runner now that the asynchronous search has answered
        start();
    });
});
asyncTest("check findTitlesWithPrefix 'a'", function() {
    // Same expectations as the upper-case 'A' search, but starting from a
    // lower-case prefix
    expect(2);
    localZimArchive.findTitlesWithPrefix('a', 5, function(titleList) {
        ok(titleList && titleList.length === 5, "Article list with 5 results");
        var head = titleList[0];
        equal(head.title , 'A Fool for You', 'First result should be "A Fool for You"');
        // Resume the QUnit runner now that the asynchronous search has answered
        start();
    });
});
asyncTest("check findTitlesWithPrefix 'blues brothers'", function() {
    // An all-lower-case, two-word prefix: the expected first title below is
    // capitalized on both words
    expect(2);
    localZimArchive.findTitlesWithPrefix('blues brothers', 5, function(titleList) {
        ok(titleList && titleList.length === 3, "Article list with 3 result");
        var head = titleList[0];
        equal(head.title , 'Blues Brothers (film)', 'First result should be "Blues Brothers (film)"');
        // Resume the QUnit runner now that the asynchronous search has answered
        start();
    });
});
asyncTest("article '(The Night Time Is) The Right Time' correctly redirects to 'Night Time Is the Right Time'", function() {
expect(6);

View File

@ -32,6 +32,93 @@ define(['q'], function(q) {
return str.indexOf(suffix, str.length - suffix.length) !== -1;
}
/**
 * Upper-cases the first character of a String and leaves the rest untouched.
 * A falsy or empty value is handed back unchanged.
 * @param {String} string
 * @returns {String}
 */
function ucFirstLetter(string) {
    var hasFirstLetter = string && string.length >= 1;
    return hasFirstLetter
        ? string[0].toLocaleUpperCase() + string.slice(1)
        : string;
}
/**
 * Lower-cases the first character of a String and leaves the rest untouched.
 * A falsy or empty value is handed back unchanged.
 * @param {String} string
 * @returns {String}
 */
function lcFirstLetter(string) {
    var hasFirstLetter = Boolean(string) && string.length >= 1;
    if (!hasFirstLetter) {
        return string;
    }
    var head = string.charAt(0).toLocaleLowerCase();
    var tail = string.slice(1);
    return head + tail;
}
/**
 * Returns the same String with the first letter of every word in upper-case.
 *
 * A "word" is a run of word characters: ASCII letters, digits, underscore,
 * plus any character that has distinct upper and lower case forms (so that
 * accented letters such as "é" belong to the word instead of splitting it).
 * Every other character (space, hyphen, apostrophe...) separates words.
 * A falsy value (empty String, null, undefined) is returned unchanged.
 *
 * @param {String} string
 * @returns {String}
 */
function ucEveryFirstLetter(string) {
    if (!string) {
        return string;
    }
    var asciiWordChar = /\w/;
    // The ASCII-only \w class does not know accented letters, so a cased
    // character (upper form differs from lower form) is also accepted.
    var isWordChar = function (character) {
        return asciiWordChar.test(character) ||
            character.toLocaleUpperCase() !== character.toLocaleLowerCase();
    };
    var result = "";
    var previousIsWordChar = false;
    for (var i = 0; i < string.length; i++) {
        var character = string.charAt(i);
        var currentIsWordChar = isWordChar(character);
        // Only the character opening a new word is upper-cased
        if (currentIsWordChar && !previousIsWordChar) {
            result += character.toLocaleUpperCase();
        } else {
            result += character;
        }
        previousIsWordChar = currentIsWordChar;
    }
    return result;
}
/**
 * Sorts the given array of Titles by their "title" property, then removes
 * every element whose title equals the title of the element kept just
 * before it. Both steps modify the given array in place.
 *
 * @param {Array.<Title>} array of Titles
 * @returns {Array.<Title>} same array of Titles, sorted and without duplicates
 */
function removeDuplicateTitlesInArray(array) {
    var compareByTitle = function(left, right) {
        if (left.title < right.title) return -1;
        if (left.title > right.title) return 1;
        return 0;
    };
    array.sort(compareByTitle);
    // Equal titles are adjacent after sorting: keep only the first of each
    // run, moving kept elements towards the front of the array.
    var keptCount = 0;
    for (var readIndex = 0; readIndex < array.length; readIndex++) {
        var startsNewRun = readIndex === 0 ||
            array[keptCount - 1].title !== array[readIndex].title;
        if (startsNewRun) {
            array[keptCount] = array[readIndex];
            keptCount++;
        }
    }
    // Drop the leftover tail
    array.length = keptCount;
    return array;
}
/**
 * Builds a new array holding the elements of the given array of Strings
 * with duplicates removed: only the first occurrence of each value is
 * kept, and the original order is preserved.
 * Each element is looked up linearly in the result, which is meant for
 * small arrays.
 * Source : http://codereview.stackexchange.com/questions/60128/removing-duplicates-from-an-array-quickly
 *
 * @param {Array.<String>} array of Strings
 * @returns {Array.<String>} new array of Strings, without duplicates
 */
function removeDuplicateStringsInSmallArray(array) {
    var firstOccurrences = [];
    var position = 0;
    while (position < array.length) {
        var candidate = array[position];
        position += 1;
        var alreadyKept = firstOccurrences.indexOf(candidate) >= 0;
        if (!alreadyKept) {
            firstOccurrences.push(candidate);
        }
    }
    return firstOccurrences;
}
/**
* Read an integer encoded in 4 bytes, little endian
* @param {Array} byteArray
@ -223,6 +310,11 @@ define(['q'], function(q) {
*/
return {
endsWith: endsWith,
ucFirstLetter: ucFirstLetter,
lcFirstLetter: lcFirstLetter,
ucEveryFirstLetter: ucEveryFirstLetter,
removeDuplicateTitlesInArray: removeDuplicateTitlesInArray,
removeDuplicateStringsInSmallArray: removeDuplicateStringsInSmallArray,
readIntegerFrom4Bytes: readIntegerFrom4Bytes,
readIntegerFrom2Bytes : readIntegerFrom2Bytes,
readFloatFrom4Bytes : readFloatFrom4Bytes,

View File

@ -109,13 +109,43 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
*/
/**
 * Look for titles starting with the given prefix.
 * For now, ZIM titles are case sensitive.
 * So, as a workaround, several case variants of the prefix are searched one
 * after the other : the prefix as given, with its first letter in upper-case,
 * with its first letter in lower-case, and with the first letter of every
 * word in upper-case (identical variants are only searched once).
 * The results of the successive searches are appended in that order, and the
 * search stops as soon as resultSize titles have been collected or every
 * variant has been tried.
 * This should be enhanced when the ZIM format is modified to store normalized titles.
 * See https://phabricator.wikimedia.org/T108536
 *
 * @param {String} prefix
 * @param {Integer} resultSize
 * @param {callbackTitleList} callback called once with the collected titles
 */
ZIMArchive.prototype.findTitlesWithPrefix = function(prefix, resultSize, callback) {
    var archive = this;
    var variants = util.removeDuplicateStringsInSmallArray([
        prefix,
        util.ucFirstLetter(prefix),
        util.lcFirstLetter(prefix),
        util.ucEveryFirstLetter(prefix)
    ]);
    var collected = [];
    var nextVariantIndex = 0;

    // Searches the next untried variant, or reports the collected titles
    // when there is nothing left to try or enough titles were found.
    var tryNextVariant = function() {
        var noVariantLeft = nextVariantIndex >= variants.length;
        if (noVariantLeft || collected.length >= resultSize) {
            callback(collected);
            return;
        }
        var variant = variants[nextVariantIndex];
        nextVariantIndex += 1;
        // Only ask for as many titles as are still missing
        var remaining = resultSize - collected.length;
        archive.findTitlesWithPrefixCaseSensitive(variant, remaining, function(found) {
            Array.prototype.push.apply(collected, found);
            tryNextVariant();
        });
    };

    tryNextVariant();
};
/**
* Look for titles starting with the given prefix (case-sensitive)
*
* @param {String} prefix
* @param {Integer} resultSize
* @param {callbackTitleList} callback
*/
ZIMArchive.prototype.findTitlesWithPrefixCaseSensitive = function(prefix, resultSize, callback) {
var that = this;
util.binarySearch(0, this._file.articleCount, function(i) {
return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {