First usable version of article searching

Removed the possibility to list all articles (much too slow).
Added the removal of diacritics (accents, etc.) from the user-typed prefix.
This commit is contained in:
mossroy 2012-12-28 21:48:46 +01:00
parent c598de1ac5
commit 638d2f8815
4 changed files with 80 additions and 72 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
evopedia-html5/WebContent/evopedia.html~
evopedia-html5/WebContent/evopedia.js~
evopedia-html5/WebContent/remove_diacritics.js~

View File

@ -31,6 +31,7 @@ License:
<title>Evopedia HTML5</title>
<script type="text/javascript" src="bzip2-antimatter15.js"></script>
<script type="text/javascript" src="remove_diacritics.js"></script>
</head>
<body>
@ -43,11 +44,9 @@ Please pick the files wikipedia_*.dat from the same dump :<br/>
<input type="file" id="dataFiles" multiple="true"/>
</div>
<br/>
Find a title : <input type="text" id="prefix" value="" />&nbsp;<input type="button" id="searchTitles" value="Search titles" onclick="searchTitlesFromPrefix(titleFile,document.getElementById('prefix').value)" />
Find titles from the prefix : <input type="text" id="prefix" value="" onkeyup="onKeyUpPrefix(event)" />&nbsp;<input type="button" id="searchTitles" value="Search titles" onclick="searchTitlesFromPrefix(titleFile,document.getElementById('prefix').value)" />
<br/>
<input type="button" id="readTitle" value="Read all title list from index" onclick="readAllTitlesFromIndex(titleFile)" />
<br/>
Choose a title : <select id="titleList" onchange="updateOffsetsFromTitle(this.value)"></select>
Choose a title from the filtered list : <select id="titleList" onchange="updateOffsetsFromTitle(this.value)"></select>
<br/>
File number : <input type="text" id="filenumber" value="0" />
<br/>
@ -62,7 +61,7 @@ Length : <input type="text" id="length" value="8866" />
<!-- TODO : add CSS styles -->
<div id="articleContent">&nbsp;</div>
<hr/>
<textarea id="rawArticleContent" cols="80" rows="20">&nbsp;</textarea>
<textarea id="debugZone" cols="80" rows="20">&nbsp;</textarea>
<script type="text/javascript" src="evopedia.js"></script>
</body>

View File

@ -131,6 +131,8 @@ function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
reader.readAsArrayBuffer(blob);
}
else {
// We found the closest title
//alert ("Found the closest title near index "+lo);
readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,lo);
}
}
@ -142,7 +144,7 @@ function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
function searchTitlesFromPrefix(titleFile, prefix) {
if (titleFile) {
var titleFileSize = titleFile.size;
// TODO : normalize the prefix (remove accents etc)
prefix = normalizeString(prefix);
var reader = new FileReader();
reader.onerror = errorHandler;
@ -156,77 +158,73 @@ function searchTitlesFromPrefix(titleFile, prefix) {
}
}
/**
 * Stub version : only shows an alert containing the given index.
 * Reading the titles themselves is not implemented here (see the TODO below).
 * @param titleFile title file (not used by this stub)
 * @param prefix prefix to look for (not used by this stub)
 * @param index index displayed in the alert message
 */
function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,index) {
// We found the closest title
alert ("Found the closest title at index "+index);
// TODO : read the following titles, stopping when the title does not start with the prefix any more (or at a maximum number of titles)
}
/**
* Read all the titles from the index file, and populate the dropdown list
* Warning : only usable on very small dumps. It is much too long on normal dumps
* Read the titles following the given index in the title file, until one of the following conditions is reached :
* - the title does not start with the prefix anymore
* - we already read the maximum number of titles
* and populate the dropdown list
*/
function readAllTitlesFromIndex(titleFile) {
if (titleFile) {
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('Title file read cancelled');
};
reader.onload = function(e) {
var binaryTitleFile = e.target.result;
var byteArray = new Uint8Array(binaryTitleFile);
function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,startIndex) {
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('Title file read cancelled');
};
reader.onload = function(e) {
var binaryTitleFile = e.target.result;
var byteArray = new Uint8Array(binaryTitleFile);
// Look for the index of the next NewLine
var newLineIndex=0;
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
newLineIndex++;
}
var i = newLineIndex;
var titleNumber=-1;
var comboTitleList = document.getElementById('titleList');
while (i<byteArray.length && titleNumber<50) {
var filenumber = 0;
var blockstart = 0;
var blockoffset = 0;
var length = 0;
var title = "";
// TODO : interpret escape area
var escape1 = byteArray[i];
var escape2 = byteArray[i+1];
filenumber = byteArray[i+2];
var i = 0;
var titleNumber=0;
var comboTitleList = document.getElementById('titleList');
blockstart = readIntegerFrom4Bytes(byteArray,i+3);
blockoffset = readIntegerFrom4Bytes(byteArray,i+7);
length = readIntegerFrom4Bytes(byteArray,i+11);
var newLineIndex = i+15;
while (i<byteArray.length) {
var filenumber = 0;
var blockstart = 0;
var blockoffset = 0;
var length = 0;
var title = "";
// TODO : interpret escape area
var escape1 = byteArray[i];
var escape2 = byteArray[i+1];
filenumber = byteArray[i+2];
blockstart = readIntegerFrom4Bytes(byteArray,i+3);
blockoffset = readIntegerFrom4Bytes(byteArray,i+7);
length = readIntegerFrom4Bytes(byteArray,i+11);
var newLineIndex = i+15;
// Look for the index of NewLine
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
newLineIndex++;
}
title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
if (title) {
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
}
titleNumber++;
i=newLineIndex-1;
// Look for the index of the next NewLine
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
newLineIndex++;
}
};
var blob = titleFile;
// Read in the file as a binary string
reader.readAsArrayBuffer(blob);
}
else {
alert('Title file not set');
}
title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
// Skip the first title
if (titleNumber>=0 && title) {
// TODO : check if the title starts with prefix, and return if it does not
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
}
titleNumber++;
i=newLineIndex-1;
}
// Trigger onchange on the combo, so that the value of the selected item (the first one) is read
comboTitleList.onchange();
};
var blob = titleFile.slice(startIndex);
// Read in the file as a binary string
reader.readAsArrayBuffer(blob);
}
/**
* Decompress and read an article in dump files
*/
function readArticleFromHtmlForm(dataFiles) {
document.getElementById("articleContent").innerHTML="Loading article from dump...";
if (dataFiles && dataFiles.length>0) {
var filenumber = document.getElementById('filenumber').value;
var blockstart = document.getElementById('blockstart').value;
@ -289,7 +287,7 @@ function readArticleFromOffset(dataFile, blockstart, blockoffset, length) {
document.getElementById('articleContent').innerHTML = htmlArticle;
// For testing purpose
//document.getElementById('rawArticleContent').value = htmlArticle;
//document.getElementById('debugZone').value = htmlArticle;
};
// TODO : should be improved by reading the file chunks by chunks until the article is found,
@ -322,3 +320,12 @@ function handleDataFileSelect(evt) {
/**
 * Store the first file of evt.target.files in the titleFile variable,
 * which is defined outside of this function
 * @param evt event whose target exposes the list of selected files
 */
function handleTitleFileSelect(evt) {
    var selectedFiles = evt.target.files;
    titleFile = selectedFiles[0];
}
/**
 * Handle Enter key in the prefix input zone :
 * when the released key is Enter, simulate a click on the "searchTitles" button.
 * The key is identified with evt.key when the browser provides it, and with
 * the deprecated evt.keyCode (13) otherwise.
 * Nothing happens if the event or the button is missing.
 * @param evt keyup event received by the prefix input
 */
function onKeyUpPrefix(evt) {
    var ENTER_KEY_CODE = 13;
    if (!evt) {
        return;
    }
    var isEnterKey = evt.key === "Enter" || evt.keyCode === ENTER_KEY_CODE;
    if (!isEnterKey) {
        return;
    }
    var searchButton = document.getElementById("searchTitles");
    if (searchButton) {
        searchButton.click();
    }
}

View File

@ -182,9 +182,10 @@ var diacriticsMap = [
];
for(var i=0; i<diacriticsMap.length; i++) {
permalink = stringWithDiacritics.replace(diacriticsMap[i].letters, diacriticsMap[i].base);
/**
 * Apply every entry of diacriticsMap to the given string, in order :
 * each match of the entry's "letters" pattern is replaced by the entry's "base"
 * @param string the string to normalize
 * @return the string obtained after all the replacements
 */
function normalizeString(string) {
    return diacriticsMap.reduce(function (partiallyNormalized, mapping) {
        return partiallyNormalized.replace(mapping.letters, mapping.base);
    }, string);
}