mirror of
https://github.com/kiwix/kiwix-js.git
synced 2025-09-22 12:01:15 -04:00
Big refactoring of the code, in a more object-oriented way.
But I could not push it as far as I would like: the FileReader API is asynchronous, which prevents some implementations (e.g. a constructor in a class cannot read files). The title search is not very accurate yet, which is why the links do not work very well for now.
This commit is contained in:
parent
6c5020b42a
commit
7802cbf3b8
@ -48,16 +48,16 @@ License:
|
|||||||
<br />
|
<br />
|
||||||
To use it, you have to first download locally a dump from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a Bittorrent client), and select some of the dowloaded files below.
|
To use it, you have to first download locally a dump from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a Bittorrent client), and select some of the dowloaded files below.
|
||||||
<br />
|
<br />
|
||||||
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||||
<br />
|
<br />
|
||||||
<br />
|
<br />
|
||||||
<ul>
|
<ul>
|
||||||
<li>On desktops, it works at least on recent Firefox and Chrome</li>
|
<li>On desktops, it works on recent Firefox and Chrome, and maybe on other browsers</li>
|
||||||
<li>On the Firefos OS simulator, you have (for now) to put the small dump files in a "fake-sdcard" folder of your firefox profile (ex : ~/.mozilla/firefox/xxxx.default/extensions/r2d2b2g@mozilla.org/profile/fake-sdcard). It looks for wikipedia_small_2010-08-14/titles.idx in it. You also need to install the application from the dashboard of the simulator instead of accessing via the browser (due to security restrictions in Firefox OS : only certified webapps can access the sdcard)</li>
|
<li>On the Firefos OS simulator, you have (for now) to put the small dump files in a "fake-sdcard" folder of your firefox profile (ex : ~/.mozilla/firefox/xxxx.default/extensions/r2d2b2g@mozilla.org/profile/fake-sdcard). It looks for wikipedia_small_2010-08-14/titles.idx in it. You also need to install the application from the dashboard of the simulator instead of accessing via the browser (due to security restrictions in Firefox OS : only certified webapps can access the sdcard)</li>
|
||||||
<li>I could not test it on a real Firefox OS device : if someone did, please let me know</li>
|
<li>I could not test it on a real Firefox OS device : if someone did, please let me know</li>
|
||||||
</ul>
|
</ul>
|
||||||
<br />
|
<br />
|
||||||
It's only a proof of concept sor far : there are certainly many many ways this could be enhanced (suggestions and patches are welcome : the source code is on <a href="https://github.com/mossroy/evopedia-html5">github</a>). In particular, the performance can be optimized when reading an article. I also know the links inside an article do not work very well for now.
|
It's only a proof of concept so far : there are many many ways this could be enhanced (suggestions and patches are welcome : the source code is on <a href="https://github.com/mossroy/evopedia-html5">github</a>). In particular, the performance can be optimized when reading an article. I also know the searches are not always very accurate, and the links inside an article do not work well for now.
|
||||||
<br />
|
<br />
|
||||||
<div id="openLocalFiles" style="display: none;">
|
<div id="openLocalFiles" style="display: none;">
|
||||||
<br /> Please select the file titles.idx :<br /> <input type="file"
|
<br /> Please select the file titles.idx :<br /> <input type="file"
|
||||||
|
541
www/js/app.js
541
www/js/app.js
@ -18,419 +18,200 @@ define(function(require) {
|
|||||||
require('./install-button');
|
require('./install-button');
|
||||||
|
|
||||||
// Evopedia javascript dependencies
|
// Evopedia javascript dependencies
|
||||||
var bzip2 = require('bzip2');
|
|
||||||
var remove_diacritics = require('remove_diacritics');
|
|
||||||
var evopedia = require('evopedia');
|
var evopedia = require('evopedia');
|
||||||
|
|
||||||
|
|
||||||
var dataFiles=document.getElementById('dataFiles').files;
|
var localArchive = null;
|
||||||
var titleFile=document.getElementById('titleFile').files[0];
|
setLocalArchiveFromFileSelect();
|
||||||
|
|
||||||
// Define behavior of HTML elements
|
// Define behavior of HTML elements
|
||||||
$('#searchTitles').on('click', function(e) {
|
$('#searchTitles').on('click', function(e) {
|
||||||
searchTitlesFromPrefix(titleFile,$('#prefix').val());
|
searchTitlesFromPrefix($('#prefix').val());
|
||||||
});
|
});
|
||||||
$('#titleList').on('change', function(e) {
|
$('#toggleDebug').on('click', function(e) {
|
||||||
updateOffsetsFromTitle(this.value);
|
switchDebugOnOff();
|
||||||
});
|
});
|
||||||
$('#toggleDebug').on('click', function(e) {
|
$('#readData').on('click', function(e) {
|
||||||
switchDebugOnOff();
|
findTitleFromTitleIdAndLaunchArticleRead($('#titleList').val());
|
||||||
});
|
});
|
||||||
$('#readData').on('click', function(e) {
|
$('#prefix').on('keyup', function(e) {
|
||||||
readArticleFromHtmlForm(dataFiles);
|
onKeyUpPrefix(e);
|
||||||
});
|
});
|
||||||
$('#prefix').on('keyup', function(e) {
|
|
||||||
onKeyUpPrefix(e);
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
// Detect if DeviceStorage is available
|
// Detect if DeviceStorage is available
|
||||||
var storage = null;
|
var storage = null;
|
||||||
if ($.isFunction(navigator.getDeviceStorage)) {
|
if ($.isFunction(navigator.getDeviceStorage)) {
|
||||||
storage = navigator.getDeviceStorage('sdcard');
|
storage = navigator.getDeviceStorage('sdcard');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (storage != null) {
|
if (storage != null) {
|
||||||
var filerequest = storage.get('wikipedia_small_2010-08-14/wikipedia_00.dat');
|
var filerequest = storage.get('wikipedia_small_2010-08-14/wikipedia_00.dat');
|
||||||
filerequest.onsuccess = function() {
|
filerequest.onsuccess = function() {
|
||||||
dataFiles = [];
|
localArchive = new evopedia.LocalArchive();
|
||||||
dataFiles[0] = filerequest.result;
|
localArchive.dataFiles[0] = filerequest.result;
|
||||||
filerequest = storage.get('wikipedia_small_2010-08-14/titles.idx');
|
filerequest = storage.get('wikipedia_small_2010-08-14/titles.idx');
|
||||||
filerequest.onsuccess = function() {
|
filerequest.onsuccess = function() {
|
||||||
titleFile = filerequest.result;
|
localArchive.titleFile = filerequest.result;
|
||||||
};
|
};
|
||||||
filerequest.onerror = function(event) {
|
filerequest.onerror = function(event) {
|
||||||
alert("error reading title file : " + event.target.error.name);
|
alert("error reading title file : " + event.target.error.name);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
filerequest.onerror = function(event) {
|
filerequest.onerror = function(event) {
|
||||||
alert("error reading data file : " + event.target.error.name);
|
alert("error reading data file : " + event.target.error.name);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
displayFileSelect();
|
displayFileSelect();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Displays the zone to select files from the dump
|
* Displays the zone to select files from the dump
|
||||||
*/
|
*/
|
||||||
function displayFileSelect() {
|
function displayFileSelect() {
|
||||||
$('#openLocalFiles').show();
|
$('#openLocalFiles').show();
|
||||||
document.getElementById('dataFiles').addEventListener('change', handleDataFileSelect, false);
|
$('#dataFiles').on('change', setLocalArchiveFromFileSelect);
|
||||||
document.getElementById('titleFile').addEventListener('change', handleTitleFileSelect, false);
|
$('#titleFile').on('change', setLocalArchiveFromFileSelect);
|
||||||
}
|
|
||||||
|
|
||||||
var debugOn = false;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Print the given string inside the debug zone
|
|
||||||
* @param string
|
|
||||||
*/
|
|
||||||
function debug(string) {
|
|
||||||
if (debugOn) {
|
|
||||||
document.getElementById("debugTextarea").value+=string+"\n";
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
var debugOn = false;
|
||||||
* Switch debug mode On/Off
|
|
||||||
*/
|
|
||||||
function switchDebugOnOff() {
|
|
||||||
if (debugOn == true) {
|
|
||||||
debugOn = false;
|
|
||||||
$('#debugZone').hide();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
debugOn = true;
|
|
||||||
$('#debugZone').show();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the Offsets HTML fields from the selected title
|
* Print the given string inside the debug zone
|
||||||
*/
|
*
|
||||||
function updateOffsetsFromTitle(selectValue) {
|
* @param string
|
||||||
var offsets=selectValue.split(/\|/);
|
*/
|
||||||
document.getElementById("filenumber").value=offsets[0];
|
function debug(string) {
|
||||||
document.getElementById("blockstart").value=offsets[1];
|
if (debugOn) {
|
||||||
document.getElementById("blockoffset").value=offsets[2];
|
document.getElementById("debugTextarea").value += string + "\n";
|
||||||
document.getElementById("length").value=offsets[3];
|
|
||||||
if (offsets[0]==255) {
|
|
||||||
// It's a redirect : find out the real offsets (asynchronous read)
|
|
||||||
readRedirectOffsets(titleFile,offsets[1]);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
document.getElementById('redirectfilenumber').value = "";
|
|
||||||
document.getElementById('redirectblockstart').value = "";
|
|
||||||
document.getElementById('redirectblockoffset').value = "";
|
|
||||||
document.getElementById('redirectlength').value = "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function is recursively called after each asynchronous read,
|
|
||||||
* so that to find the closest index in titleFile to the given prefix
|
|
||||||
*/
|
|
||||||
function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
|
|
||||||
if (lo < hi-1 ) {
|
|
||||||
var mid = Math.round((lo+hi)/2);
|
|
||||||
// TODO : improve the way we read this file : 256 bytes is arbitrary and might be too small
|
|
||||||
var blob = titleFile.slice(mid,mid+256);
|
|
||||||
reader.onload = function(e) {
|
|
||||||
var binaryTitleFile = e.target.result;
|
|
||||||
var byteArray = new Uint8Array(binaryTitleFile);
|
|
||||||
// Look for the index of the next NewLine
|
|
||||||
var newLineIndex=0;
|
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
|
||||||
newLineIndex++;
|
|
||||||
}
|
|
||||||
var i = newLineIndex+1;
|
|
||||||
newLineIndex = i+15;
|
|
||||||
// Look for the index of the next NewLine
|
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
|
||||||
newLineIndex++;
|
|
||||||
}
|
|
||||||
var title = evopedia.utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
|
||||||
debug("title found : "+title);
|
|
||||||
if (title.localeCompare(prefix)<0) {
|
|
||||||
lo = mid;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
hi = mid;
|
|
||||||
}
|
|
||||||
recursivePrefixSearch(titleFile, reader, prefix, lo, hi);
|
|
||||||
};
|
|
||||||
debug("Reading the file from "+mid+" to "+(mid+256)+" because lo="+lo+" and hi="+hi);
|
|
||||||
// Read the file as a binary string
|
|
||||||
reader.readAsArrayBuffer(blob);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// We found the closest title
|
|
||||||
debug ("Found the closest title near index "+lo);
|
|
||||||
readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,lo);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Search the index for titles that start with the given prefix
|
|
||||||
* (implemented with a binary search inside the index file)
|
|
||||||
*/
|
|
||||||
function searchTitlesFromPrefix(titleFile, prefix) {
|
|
||||||
if (titleFile) {
|
|
||||||
var titleFileSize = titleFile.size;
|
|
||||||
prefix = remove_diacritics.normalizeString(prefix);
|
|
||||||
|
|
||||||
var reader = new FileReader();
|
|
||||||
reader.onerror = errorHandler;
|
|
||||||
reader.onabort = function(e) {
|
|
||||||
alert('Title file read cancelled');
|
|
||||||
};
|
|
||||||
recursivePrefixSearch(titleFile, reader, prefix, 0, titleFileSize);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
alert ("Title file not set");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read the real offsets when a redirect was found, based on the redirectIndex provided
|
|
||||||
* The file read is asynchronous, and populates the html form as soon as the offsets are found
|
|
||||||
* @param titleFile
|
|
||||||
* @param redirectIndex
|
|
||||||
*/
|
|
||||||
function readRedirectOffsets(titleFile,redirectIndex) {
|
|
||||||
var reader = new FileReader();
|
|
||||||
reader.onerror = errorHandler;
|
|
||||||
reader.onabort = function(e) {
|
|
||||||
alert('Title file read cancelled');
|
|
||||||
};
|
|
||||||
reader.onload = function(e) {
|
|
||||||
var binaryTitleFile = e.target.result;
|
|
||||||
var byteArray = new Uint8Array(binaryTitleFile);
|
|
||||||
var filenumber = byteArray[2];
|
|
||||||
|
|
||||||
var blockstart = evopedia.readIntegerFrom4Bytes(byteArray,3);
|
|
||||||
var blockoffset = evopedia.readIntegerFrom4Bytes(byteArray,7);
|
|
||||||
var length = evopedia.readIntegerFrom4Bytes(byteArray,11);
|
|
||||||
|
|
||||||
document.getElementById('redirectfilenumber').value = filenumber;
|
|
||||||
document.getElementById('redirectblockstart').value = blockstart;
|
|
||||||
document.getElementById('redirectblockoffset').value = blockoffset;
|
|
||||||
document.getElementById('redirectlength').value = length;
|
|
||||||
};
|
|
||||||
// Read only the 16 necessary bytes
|
|
||||||
var blob = titleFile.slice(redirectIndex,redirectIndex+16);
|
|
||||||
// Read in the file as a binary string
|
|
||||||
reader.readAsArrayBuffer(blob);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read the titles following the given index in the title file, until one of the following conditions is reached :
|
|
||||||
* - the title does not start with the prefix anymore
|
|
||||||
* - we already read the maximum number of titles
|
|
||||||
* and populate the dropdown list
|
|
||||||
*/
|
|
||||||
function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,startIndex) {
|
|
||||||
var reader = new FileReader();
|
|
||||||
reader.onerror = errorHandler;
|
|
||||||
reader.onabort = function(e) {
|
|
||||||
alert('Title file read cancelled');
|
|
||||||
};
|
|
||||||
reader.onload = function(e) {
|
|
||||||
var binaryTitleFile = e.target.result;
|
|
||||||
var byteArray = new Uint8Array(binaryTitleFile);
|
|
||||||
// Look for the index of the next NewLine
|
|
||||||
var newLineIndex=0;
|
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
|
||||||
newLineIndex++;
|
|
||||||
}
|
}
|
||||||
var i = newLineIndex;
|
}
|
||||||
var titleNumber=0;
|
|
||||||
|
/**
|
||||||
|
* Switch debug mode On/Off
|
||||||
|
*/
|
||||||
|
function switchDebugOnOff() {
|
||||||
|
if (debugOn == true) {
|
||||||
|
debugOn = false;
|
||||||
|
$('#debugZone').hide();
|
||||||
|
} else {
|
||||||
|
debugOn = true;
|
||||||
|
$('#debugZone').show();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function setLocalArchiveFromFileSelect() {
|
||||||
|
dataFiles=document.getElementById('dataFiles').files;
|
||||||
|
titleFile=document.getElementById('titleFile').files[0];
|
||||||
|
localArchive = new evopedia.LocalArchive();
|
||||||
|
localArchive.dataFiles = dataFiles;
|
||||||
|
localArchive.titleFile = titleFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handle Enter key in the prefix input zone
|
||||||
|
*/
|
||||||
|
function onKeyUpPrefix(evt) {
|
||||||
|
if (evt.keyCode == 13) {
|
||||||
|
document.getElementById("searchTitles").click();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Search the index for titles that start with the given prefix (implemented
|
||||||
|
* with a binary search inside the index file)
|
||||||
|
*/
|
||||||
|
function searchTitlesFromPrefix(prefix) {
|
||||||
|
if (localArchive.titleFile) {
|
||||||
|
localArchive.findTitlesWithPrefix(prefix, populateDropDownListOfTitles);
|
||||||
|
} else {
|
||||||
|
alert("Title file not set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Populate the drop-down list of titles with the given list
|
||||||
|
*/
|
||||||
|
function populateDropDownListOfTitles(titleList) {
|
||||||
var comboTitleList = document.getElementById('titleList');
|
var comboTitleList = document.getElementById('titleList');
|
||||||
while (i<byteArray.length && titleNumber<50) {
|
for (var i=0; i<titleList.length; i++) {
|
||||||
// Look for the index of the next NewLine
|
var title = titleList[i];
|
||||||
newLineIndex+=15;
|
comboTitleList.options[i] = new Option (title.name, title.toStringId());
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
|
||||||
newLineIndex++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy the encodedTitle in a new Array
|
|
||||||
var encodedTitle = new Uint8Array(newLineIndex-i);
|
|
||||||
for (var j = 0; j < newLineIndex-i; j++) {
|
|
||||||
encodedTitle[j] = byteArray[i+j];
|
|
||||||
}
|
|
||||||
|
|
||||||
var title = evopedia.Title.parseTitle(encodedTitle, new evopedia.LocalArchive(), i);
|
|
||||||
|
|
||||||
// Skip the titles that do not start with the prefix
|
|
||||||
// TODO use a normalizer to compare the strings
|
|
||||||
if (title && title.getReadableName().toLowerCase().indexOf(prefix.toLowerCase())==0) {
|
|
||||||
comboTitleList.options[titleNumber] = new Option (title.name, title.fileNr + "|" + title.blockStart + "|" + title.blockOffset + "|" + title.articleLength);
|
|
||||||
debug("Title : startIndex = " + i + " endIndex = " + newLineIndex + title.toString());
|
|
||||||
titleNumber++;
|
|
||||||
}
|
|
||||||
i=newLineIndex+1;
|
|
||||||
}
|
}
|
||||||
// Update the offsets, as if the first item of the list was selected by the user
|
}
|
||||||
updateOffsetsFromTitle($('#titleList').val());
|
|
||||||
};
|
|
||||||
var blob = titleFile.slice(startIndex);
|
|
||||||
// Read in the file as a binary string
|
|
||||||
reader.readAsArrayBuffer(blob);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decompress and read an article in dump files
|
* Creates an instance of title from given titleId (including resolving redirects),
|
||||||
*/
|
* and call the function to read the corresponding article
|
||||||
function readArticleFromHtmlForm(dataFiles) {
|
*/
|
||||||
document.getElementById("articleContent").innerHTML="Loading article from dump...";
|
function findTitleFromTitleIdAndLaunchArticleRead(titleId) {
|
||||||
if (dataFiles && dataFiles.length>0) {
|
$("#articleContent").html("Loading article from dump...");
|
||||||
var filenumber = document.getElementById('filenumber').value;
|
if (localArchive.dataFiles && localArchive.dataFiles.length>0) {
|
||||||
var blockstart = document.getElementById('blockstart').value;
|
var title = evopedia.Title.parseTitleId(localArchive,titleId);
|
||||||
var blockoffset = document.getElementById('blockoffset').value;
|
if (title.fileNr == 255) {
|
||||||
var length = document.getElementById('length').value;
|
localArchive.resolveRedirect(title, readArticle);
|
||||||
if (filenumber==255) {
|
|
||||||
// It's a redirect : use redirected offsets
|
|
||||||
filenumber = document.getElementById('redirectfilenumber').value;
|
|
||||||
blockstart = document.getElementById('redirectblockstart').value;
|
|
||||||
blockoffset = document.getElementById('redirectblockoffset').value;
|
|
||||||
length = document.getElementById('redirectlength').value;
|
|
||||||
if (!filenumber || filenumber=="") {
|
|
||||||
// TODO : better handle this case
|
|
||||||
alert("Redirect offsets not read yet");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
var dataFile = null;
|
|
||||||
// Find the good dump file
|
|
||||||
for (var i=0; i<dataFiles.length; i++) {
|
|
||||||
var fileName = dataFiles[i].name;
|
|
||||||
var prefixedFileNumber = "";
|
|
||||||
if (filenumber<10) {
|
|
||||||
prefixedFileNumber = "0"+filenumber;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
prefixedFileNumber = filenumber;
|
readArticle(title);
|
||||||
}
|
}
|
||||||
var expectedFileName = "wikipedia_"+prefixedFileNumber+".dat";
|
|
||||||
// Check if the fileName ends with the expected file name (in case of DeviceStorage usage, the fileName is prefixed by the directory)
|
|
||||||
if (fileName.match(expectedFileName+"$") == expectedFileName) {
|
|
||||||
dataFile = dataFiles[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!dataFile) {
|
|
||||||
alert("File number " + filenumber + " not found");
|
|
||||||
document.getElementById("articleContent").innerHTML="";
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
readArticleFromOffset(dataFile, blockstart, blockoffset, length);
|
alert("Data files not set");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
alert("Data files not set");
|
/**
|
||||||
|
* Read the article corresponding to the given title
|
||||||
|
*/
|
||||||
|
function readArticle(title) {
|
||||||
|
if ($.isArray(title)) {
|
||||||
|
title = title[0];
|
||||||
|
}
|
||||||
|
localArchive.readArticle(title, displayArticleInForm);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read an article in a dump file, based on given offsets
|
|
||||||
*/
|
|
||||||
function readArticleFromOffset(dataFile, blockstart, blockoffset, length) {
|
|
||||||
|
|
||||||
var reader = new FileReader();
|
|
||||||
reader.onerror = errorHandler;
|
|
||||||
reader.onabort = function(e) {
|
|
||||||
alert('Data file read cancelled');
|
|
||||||
};
|
|
||||||
reader.onload = function(e) {
|
|
||||||
var compressedArticles = e.target.result;
|
|
||||||
//var htmlArticle = ArchUtils.bz2.decode(compressedArticles);
|
|
||||||
// TODO : should be improved by uncompressing the content chunk by chunk,
|
|
||||||
// until the length is reached, instead of uncompressing everything
|
|
||||||
var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));
|
|
||||||
// Start reading at offset, and keep length characters
|
|
||||||
var htmlArticle = htmlArticles.substring(blockoffset,blockoffset+length);
|
|
||||||
// Keep only length characters
|
|
||||||
htmlArticle = htmlArticle.substring(0,length);
|
|
||||||
// Decode UTF-8 encoding
|
|
||||||
htmlArticle = decodeURIComponent(escape(htmlArticle));
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Display the given HTML article in the web page,
|
||||||
|
* and convert links to javascript calls
|
||||||
|
*/
|
||||||
|
function displayArticleInForm(htmlArticle) {
|
||||||
// Display the article inside the web page.
|
// Display the article inside the web page.
|
||||||
$('#articleContent').html(htmlArticle);
|
$('#articleContent').html(htmlArticle);
|
||||||
|
|
||||||
// Convert links into javascript calls
|
// Convert links into javascript calls
|
||||||
$('#articleContent').find('a').each(function(){
|
$('#articleContent').find('a').each(function(){
|
||||||
// Store current link's url
|
// Store current link's url
|
||||||
var url = $(this).attr("href");
|
var url = $(this).attr("href");
|
||||||
|
|
||||||
if(url.slice(0, 1) == "#") {
|
if(url.slice(0, 1) == "#") {
|
||||||
// It's an anchor link : do nothing
|
// It's an anchor link : do nothing
|
||||||
}
|
}
|
||||||
else if (url.substring(0,4) === "http") {
|
else if (url.substring(0,4) === "http") {
|
||||||
// It's an external link : do nothing
|
// It's an external link : do nothing
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// It's a link to another article : add an onclick event to go to this article
|
// It's a link to another article : add an onclick event to go to this article
|
||||||
// instead of following the link
|
// instead of following the link
|
||||||
$(this).on('click', function(e) {
|
$(this).on('click', function(e) {
|
||||||
goToArticle($(this).attr("href"));
|
goToArticle($(this).attr("href"));
|
||||||
return false;
|
return false;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
});
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO : should be improved by reading the file chunks by chunks until the article is found,
|
|
||||||
// instead of reading the whole file starting at blockstart
|
|
||||||
var blob = dataFile.slice(blockstart);
|
|
||||||
|
|
||||||
// Read in the image file as a binary string.
|
|
||||||
reader.readAsArrayBuffer(blob);
|
|
||||||
}
|
|
||||||
|
|
||||||
function errorHandler(evt) {
|
|
||||||
switch(evt.target.error.code) {
|
|
||||||
case evt.target.error.NOT_FOUND_ERR:
|
|
||||||
alert('File Not Found!');
|
|
||||||
break;
|
|
||||||
case evt.target.error.NOT_READABLE_ERR:
|
|
||||||
alert('File is not readable');
|
|
||||||
break;
|
|
||||||
case evt.target.error.ABORT_ERR:
|
|
||||||
break; // noop
|
|
||||||
default:
|
|
||||||
alert('An error occurred reading this file.');
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function handleDataFileSelect(evt) {
|
|
||||||
dataFiles = evt.target.files;
|
|
||||||
}
|
|
||||||
|
|
||||||
function handleTitleFileSelect(evt) {
|
|
||||||
titleFile = evt.target.files[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Handle Enter key in the prefix input zone
|
|
||||||
*/
|
|
||||||
function onKeyUpPrefix(evt) {
|
|
||||||
if (evt.keyCode == 13) {
|
|
||||||
document.getElementById("searchTitles").click();
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Replace article content with the one of the given title
|
/**
|
||||||
*/
|
* Replace article content with the one of the given title
|
||||||
function goToArticle(title) {
|
*/
|
||||||
// This is awful and does not work very well.
|
function goToArticle(title) {
|
||||||
// It's just temporary before the algorithm is rewritten in an object-oriented way
|
$("#articleContent").html("Loading article from dump...");
|
||||||
// TODO : rewrite this with a real article search and display
|
localArchive.getTitleByName(title, readArticle);
|
||||||
searchTitlesFromPrefix(titleFile,title);
|
}
|
||||||
updateOffsetsFromTitle($('#titleList').val());
|
|
||||||
document.getElementById("articleContent").innerHTML="";
|
|
||||||
}
|
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
define(function(require) {
|
define(function(require) {
|
||||||
|
|
||||||
|
// Module dependencies
|
||||||
|
var remove_diacritics = require('remove_diacritics');
|
||||||
|
var bzip2 = require('bzip2');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read an integer encoded in 4 bytes
|
* Read an integer encoded in 4 bytes
|
||||||
*/
|
*/
|
||||||
@ -37,11 +41,263 @@ define(function(require) {
|
|||||||
* It's still minimal for now. TODO : complete implementation to handle maths and coordinates
|
* It's still minimal for now. TODO : complete implementation to handle maths and coordinates
|
||||||
*/
|
*/
|
||||||
function LocalArchive() {
|
function LocalArchive() {
|
||||||
this.directory = null;
|
this.dataFiles = new Array();
|
||||||
this.titleFile = null;
|
this.titleFile = null;
|
||||||
this.date = null;
|
// TODO to be replaced by the real archive attributes
|
||||||
this.language = null;
|
this.date = "2013-03-14";
|
||||||
}
|
this.language = "zz";
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function is recursively called after each asynchronous read,
|
||||||
|
* so that to find the closest index in titleFile to the given prefix
|
||||||
|
* When found, call the callbackFunction with the index
|
||||||
|
* @param reader
|
||||||
|
* @param prefix
|
||||||
|
* @param lo
|
||||||
|
* @param hi
|
||||||
|
* @param callbackFunction
|
||||||
|
*/
|
||||||
|
LocalArchive.prototype.recursivePrefixSearch = function(reader, prefix, lo, hi, callbackFunction) {
|
||||||
|
if (lo < hi-1 ) {
|
||||||
|
var mid = Math.round((lo+hi)/2);
|
||||||
|
// TODO : improve the way we read this file : 128 bytes is arbitrary and might be too small
|
||||||
|
var blob = this.titleFile.slice(mid,mid+128);
|
||||||
|
var currentLocalArchiveInstance = this;
|
||||||
|
reader.onload = function(e) {
|
||||||
|
var binaryTitleFile = e.target.result;
|
||||||
|
var byteArray = new Uint8Array(binaryTitleFile);
|
||||||
|
// Look for the index of the next NewLine
|
||||||
|
var newLineIndex=0;
|
||||||
|
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||||
|
newLineIndex++;
|
||||||
|
}
|
||||||
|
var i = newLineIndex+1;
|
||||||
|
newLineIndex = i+15;
|
||||||
|
// Look for the index of the next NewLine
|
||||||
|
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||||
|
newLineIndex++;
|
||||||
|
}
|
||||||
|
var title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
||||||
|
if (title.localeCompare(prefix)<0) {
|
||||||
|
lo = mid;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
hi = mid;
|
||||||
|
}
|
||||||
|
currentLocalArchiveInstance.recursivePrefixSearch(reader, prefix, lo, hi, callbackFunction);
|
||||||
|
};
|
||||||
|
// Read the file as a binary string
|
||||||
|
reader.readAsArrayBuffer(blob);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// We found the closest title at index lo
|
||||||
|
callbackFunction(lo);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Look for a title in the title file at the given offset, and call the callbackFunction with this Title
|
||||||
|
* @param titleOffset
|
||||||
|
* @param callbackFunction
|
||||||
|
*/
|
||||||
|
LocalArchive.prototype.getTitleAtOffset = function(titleOffset, callbackFunction) {
|
||||||
|
this.getTitlesStartingAtOffset(titleOffset, 1, callbackFunction);
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Read titles from the title file, scanning from the given offset, and call
 * the callbackFunction with the resulting array of titles.
 *
 * The title file is read from titleOffset up to its end. The scan first moves
 * to the first newline byte found, then cuts the remaining bytes into
 * newline-terminated entries. The first entry cut this way is parsed but never
 * reported; at most titleCount entries are added to the array afterwards.
 *
 * @param titleOffset offset in the title file where the read starts
 * @param titleCount maximum number of titles to retrieve
 * @param callbackFunction function invoked with the array of parsed titles
 */
LocalArchive.prototype.getTitlesStartingAtOffset = function(titleOffset, titleCount, callbackFunction) {
    var NEW_LINE = 10;
    // Number of bytes skipped at the start of an entry before looking for its
    // terminating newline (presumably a binary header that may itself contain
    // the byte 10 - TODO confirm against Title.parseTitle).
    var SKIPPED_BYTES = 15;
    var archive = this;

    // Index of the first newline byte at or after fromIndex.
    // Returns fromIndex unchanged when it is already past the end of the data.
    function findNewLine(bytes, fromIndex) {
        var index = fromIndex;
        while (index < bytes.length && bytes[index] != NEW_LINE) {
            index++;
        }
        return index;
    }

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };
    reader.onload = function(e) {
        var bytes = new Uint8Array(e.target.result);
        var titles = [];

        var lineEnd = findNewLine(bytes, 0);
        // The first entry starts ON the newline byte (not after it)
        var entryStart = lineEnd;

        // The counter starts at -1 so that the first parsed entry is dropped
        // TODO see why we need to skip the first title
        // TODO titles are not filtered against a prefix here; a normalizer
        //      would be needed to compare the strings
        for (var parsedCount = -1; entryStart < bytes.length && parsedCount < titleCount; parsedCount++) {
            lineEnd = findNewLine(bytes, lineEnd + SKIPPED_BYTES);

            // Copy the entry bytes into their own array. When lineEnd lies
            // beyond the end of the data, the missing bytes stay at 0.
            var encodedTitle = new Uint8Array(lineEnd - entryStart);
            encodedTitle.set(bytes.subarray(entryStart, lineEnd));

            // NOTE(review): entryStart is an index inside the data read from
            // titleOffset, not an absolute offset in the title file - confirm
            // this is what Title.parseTitle expects.
            var title = Title.parseTitle(encodedTitle, archive, entryStart);
            if (parsedCount >= 0) {
                titles.push(title);
            }
            entryStart = lineEnd + 1;
        }
        callbackFunction(titles);
    };

    // No end is given: everything from titleOffset to the end of the file is read
    reader.readAsArrayBuffer(this.titleFile.slice(titleOffset));
};
|
||||||
|
|
||||||
|
/**
 * Look for a title by its name.
 *
 * The whole title file (from 0 to its size) is searched with
 * recursivePrefixSearch; the offset it reports is then handed to
 * getTitleAtOffset, which calls the callbackFunction with its result.
 *
 * NOTE(review): unlike findTitlesWithPrefix, titleName is passed to the
 * search without going through remove_diacritics.normalizeString - confirm
 * whether this is intended.
 *
 * @param titleName name of the title to look for
 * @param callbackFunction function invoked by getTitleAtOffset with its result
 */
LocalArchive.prototype.getTitleByName = function(titleName, callbackFunction) {
    var archive = this;
    var searchEnd = this.titleFile.size;

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };

    // Called by the search with the offset it settled on
    function onOffsetFound(titleOffset) {
        archive.getTitleAtOffset(titleOffset, callbackFunction);
    }

    this.recursivePrefixSearch(reader, titleName, 0, searchEnd, onOffsetFound);
};
|
||||||
|
|
||||||
|
/**
 * Get a random title, and call the callbackFunction with this Title.
 *
 * Not implemented yet: the function body is empty, so the callbackFunction
 * is currently never invoked.
 *
 * @param callbackFunction function meant to receive the random Title
 */
LocalArchive.prototype.getRandomTitle = function(callbackFunction) {
    // TODO to be implemented
};
|
||||||
|
|
||||||
|
/**
 * Find titles from a prefix, and call the callbackFunction with the list of
 * titles read.
 *
 * A non-empty prefix is first normalized with
 * remove_diacritics.normalizeString. The whole title file is then searched
 * with recursivePrefixSearch, and up to 50 titles are read from the offset it
 * reports, through getTitlesStartingAtOffset.
 *
 * @param prefix beginning of the title names to look for
 * @param callbackFunction function invoked with the list of titles
 */
LocalArchive.prototype.findTitlesWithPrefix = function(prefix, callbackFunction) {
    var archive = this;
    var searchEnd = this.titleFile.size;
    // Empty, null or undefined prefixes are passed to the search untouched
    var searchedPrefix = prefix ? remove_diacritics.normalizeString(prefix) : prefix;

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };

    // Called by the search with the offset it settled on
    function onOffsetFound(titleOffset) {
        archive.getTitlesStartingAtOffset(titleOffset, 50, callbackFunction);
    }

    this.recursivePrefixSearch(reader, searchedPrefix, 0, searchEnd, onOffsetFound);
};
|
||||||
|
|
||||||
|
/**
 * Read an article from the title instance, and call the callbackFunction with
 * the article HTML String.
 *
 * The data file is picked among this.dataFiles from title.fileNr: its name
 * must end with "wikipedia_NN.dat" (NN being the file number, padded to two
 * digits). The file is read from title.blockStart to its end, decompressed
 * with bzip2, and title.articleLength characters are kept starting at
 * title.blockOffset in the decompressed content.
 *
 * @param title title providing fileNr, blockStart, blockOffset and
 *              articleLength (numbers or numeric strings)
 * @param callbackFunction function invoked with the article HTML String
 * @throws a String ("File number N not found") when no matching data file
 *         is present in this.dataFiles
 */
LocalArchive.prototype.readArticle = function(title, callbackFunction) {
    // Coerce to numbers, so that numeric strings do not turn the computation
    // of the end offset below into a string concatenation
    var blockStart = Number(title.blockStart);
    var blockOffset = Number(title.blockOffset);
    var articleLength = Number(title.articleLength);

    var prefixedFileNumber = title.fileNr < 10 ? "0" + title.fileNr : "" + title.fileNr;
    var expectedFileName = "wikipedia_" + prefixedFileNumber + ".dat";

    // Find the good dump file : its name must end with the expected file name
    // (in case of DeviceStorage usage, the fileName is prefixed by the directory).
    // When several files match, the last one is used.
    var dataFile = null;
    for (var i = 0; i < this.dataFiles.length; i++) {
        var fileName = this.dataFiles[i].name;
        var suffixStart = fileName.length - expectedFileName.length;
        if (suffixStart >= 0 && fileName.indexOf(expectedFileName, suffixStart) === suffixStart) {
            dataFile = this.dataFiles[i];
        }
    }
    if (!dataFile) {
        // A String is thrown (not an Error) to stay compatible with existing callers
        throw "File number " + title.fileNr + " not found";
    }

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Data file read cancelled');
    };
    reader.onload = function(e) {
        var compressedArticles = e.target.result;
        // TODO : should be improved by uncompressing the content chunk by chunk,
        // until the length is reached, instead of uncompressing everything
        var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));

        // Start reading at offset, and keep length characters
        var htmlArticle = htmlArticles.substring(blockOffset, blockOffset + articleLength);

        // Decode UTF-8 encoding
        // (presumably bzip2.simple returns one character per byte - TODO confirm)
        htmlArticle = decodeURIComponent(escape(htmlArticle));

        callbackFunction(htmlArticle);
    };

    // TODO : should be improved by reading the file chunks by chunks until the article is found,
    // instead of reading the whole file starting at blockstart
    var blob = dataFile.slice(blockStart);

    // Read the data as an ArrayBuffer
    reader.readAsArrayBuffer(blob);
};
|
||||||
|
|
||||||
|
/**
 * Resolve the redirect of the given title instance, and call the
 * callbackFunction with the redirected Title instance.
 *
 * 16 bytes are read from the title file of this archive, starting at
 * title.blockStart. The fileNr (byte 2), blockStart (bytes 3-6), blockOffset
 * (bytes 7-10) and articleLength (bytes 11-14) found there are written into
 * the given title.
 *
 * The title passed in is modified in place : the instance given to the
 * callbackFunction is the same object, and its other properties are kept as
 * they were.
 *
 * @param title redirect title; its blockStart (number or numeric string) is
 *              the offset read in the title file
 * @param callbackFunction function invoked with the updated title
 */
LocalArchive.prototype.resolveRedirect = function(title, callbackFunction) {
    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };
    reader.onload = function(e) {
        var binaryTitleFile = e.target.result;
        var byteArray = new Uint8Array(binaryTitleFile);

        // Not a copy : the caller's title object is updated
        var redirectedTitle = title;
        redirectedTitle.fileNr = byteArray[2];
        redirectedTitle.blockStart = readIntegerFrom4Bytes(byteArray, 3);
        redirectedTitle.blockOffset = readIntegerFrom4Bytes(byteArray, 7);
        redirectedTitle.articleLength = readIntegerFrom4Bytes(byteArray, 11);

        callbackFunction(redirectedTitle);
    };

    // Coerce to a number, so that a numeric string does not turn "+ 16" into
    // a string concatenation
    var entryStart = Number(title.blockStart);

    // Read only the 16 necessary bytes, starting at title.blockStart.
    // The title file belongs to this archive : it must be reached through "this"
    var blob = this.titleFile.slice(entryStart, entryStart + 16);

    // Read the data as an ArrayBuffer
    reader.readAsArrayBuffer(blob);
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -125,13 +381,57 @@ define(function(require) {
|
|||||||
return utf8ByteArrayToString(encodedTitle, 15, len);
|
return utf8ByteArrayToString(encodedTitle, 15, len);
|
||||||
};
|
};
|
||||||
|
|
||||||
Title.prototype.toStringId = function(){
|
/**
 * Creates a title instance from a serialized id.
 *
 * The id is split on "|" : parts 2 to 7 are, in this order, the fileNr, the
 * titleOffset, the name, the blockStart, the blockOffset and the
 * articleLength. Parts 0 and 1 are not used : the archive of the title is the
 * localArchive given as a parameter.
 *
 * All the fields except the name are converted to integers, so that they can
 * safely be used in arithmetic (adding two of them as strings would
 * concatenate them).
 *
 * The name must not contain a "|", or the fields that follow it are shifted.
 *
 * @param localArchive archive the title belongs to
 * @param titleId serialized id, with its parts separated by "|"
 * @returns the Title instance
 */
Title.parseTitleId = function(localArchive, titleId) {
    var title = new Title();
    var idParts = titleId.split("|");
    title.archive = localArchive;
    title.fileNr = parseInt(idParts[2], 10);
    title.titleOffset = parseInt(idParts[3], 10);
    title.name = idParts[4];
    title.blockStart = parseInt(idParts[5], 10);
    title.blockOffset = parseInt(idParts[6], 10);
    title.articleLength = parseInt(idParts[7], 10);
    return title;
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Serialize the title with its values.
 *
 * The fields are joined with "|", in this order : language and date of the
 * archive, then fileNr, titleOffset, name, blockStart, blockOffset and
 * articleLength of the title.
 *
 * @returns {String}
 */
Title.prototype.toStringId = function(){
    var serialized = this.archive.language + "|" + this.archive.date;
    var titleFields = [
        this.fileNr,
        this.titleOffset,
        this.name,
        this.blockStart,
        this.blockOffset,
        this.articleLength
    ];
    for (var k = 0; k < titleFields.length; k++) {
        serialized = serialized + "|" + titleFields[k];
    }
    return serialized;
};
|
||||||
|
|
||||||
|
/**
 * Serialize the title in a readable way : the serialized id, followed by the
 * name, fileNr, blockStart, blockOffset and articleLength of the title.
 *
 * NOTE(review): there is no separator between the id and "title.name".
 */
Title.prototype.toString = function(){
    var readable = "title.id = " + this.toStringId();
    readable += "title.name = " + this.name;
    readable += " title.fileNr = " + this.fileNr;
    readable += " title.blockStart = " + this.blockStart;
    readable += " title.blockOffset = " + this.blockOffset;
    readable += " title.articleLength = " + this.articleLength;
    return readable;
};
|
||||||
|
|
||||||
|
/**
 * ErrorHandler for FileReader.
 *
 * Displays an alert describing the error carried by the event
 * (evt.target.error). The kind of error is recognized either by its name
 * ("NotFoundError", "NotReadableError", "AbortError") or by comparing its
 * numeric code with the legacy constants it exposes (NOT_FOUND_ERR,
 * NOT_READABLE_ERR, ABORT_ERR). Nothing is displayed for an abort. Any other
 * error, or an event without error, gives a generic message.
 *
 * @param evt error event; the error is read from evt.target.error
 */
function errorHandler(evt) {
    var genericMessage = 'An error occurred reading this file.';
    var error = evt && evt.target ? evt.target.error : null;
    if (!error) {
        alert(genericMessage);
        return;
    }

    // True when the error has the given name, or when its code equals the
    // legacy constant of that name (only if the error exposes this constant)
    function isKind(name, legacyConstant) {
        if (error.name === name) {
            return true;
        }
        var legacyCode = error[legacyConstant];
        return legacyCode !== undefined && error.code === legacyCode;
    }

    if (isKind('NotFoundError', 'NOT_FOUND_ERR')) {
        alert('File Not Found!');
    }
    else if (isKind('NotReadableError', 'NOT_READABLE_ERR')) {
        alert('File is not readable');
    }
    else if (isKind('AbortError', 'ABORT_ERR')) {
        // noop
    }
    else {
        alert(genericMessage);
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Functions and classes exposed by this module
|
* Functions and classes exposed by this module
|
||||||
*/
|
*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user