mirror of
https://github.com/kiwix/kiwix-js.git
synced 2025-09-22 12:01:15 -04:00
Big refactoring of the code, in a more object-oriented way.
But I could not push it as far as I would like: the FileReader API is asynchronous, which prevents some implementations (e.g. a constructor in a class cannot read files). The title search is not very accurate yet, which is why the links do not work very well for now.
This commit is contained in:
parent
6c5020b42a
commit
7802cbf3b8
@ -48,16 +48,16 @@ License:
|
||||
<br />
|
||||
To use it, you have to first download locally a dump from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a Bittorrent client), and select some of the downloaded files below.
|
||||
<br />
|
||||
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||
I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||
<br />
|
||||
<br />
|
||||
<ul>
|
||||
<li>On desktops, it works at least on recent Firefox and Chrome</li>
|
||||
<li>On desktops, it works on recent Firefox and Chrome, and maybe on other browsers</li>
|
||||
<li>On the Firefox OS simulator, you have (for now) to put the small dump files in a "fake-sdcard" folder of your firefox profile (ex : ~/.mozilla/firefox/xxxx.default/extensions/r2d2b2g@mozilla.org/profile/fake-sdcard). It looks for wikipedia_small_2010-08-14/titles.idx in it. You also need to install the application from the dashboard of the simulator instead of accessing via the browser (due to security restrictions in Firefox OS : only certified webapps can access the sdcard)</li>
|
||||
<li>I could not test it on a real Firefox OS device : if someone did, please let me know</li>
|
||||
</ul>
|
||||
<br />
|
||||
It's only a proof of concept sor far : there are certainly many many ways this could be enhanced (suggestions and patches are welcome : the source code is on <a href="https://github.com/mossroy/evopedia-html5">github</a>). In particular, the performance can be optimized when reading an article. I also know the links inside an article do not work very well for now.
|
||||
It's only a proof of concept so far : there are many many ways this could be enhanced (suggestions and patches are welcome : the source code is on <a href="https://github.com/mossroy/evopedia-html5">github</a>). In particular, the performance can be optimized when reading an article. I also know the searches are not always very accurate, and the links inside an article do not work well for now.
|
||||
<br />
|
||||
<div id="openLocalFiles" style="display: none;">
|
||||
<br /> Please select the file titles.idx :<br /> <input type="file"
|
||||
|
545
www/js/app.js
545
www/js/app.js
@ -18,419 +18,200 @@ define(function(require) {
|
||||
require('./install-button');
|
||||
|
||||
// Evopedia javascript dependencies
|
||||
var bzip2 = require('bzip2');
|
||||
var remove_diacritics = require('remove_diacritics');
|
||||
var evopedia = require('evopedia');
|
||||
|
||||
|
||||
var dataFiles=document.getElementById('dataFiles').files;
|
||||
var titleFile=document.getElementById('titleFile').files[0];
|
||||
var localArchive = null;
|
||||
setLocalArchiveFromFileSelect();
|
||||
|
||||
// Define behavior of HTML elements
|
||||
$('#searchTitles').on('click', function(e) {
|
||||
searchTitlesFromPrefix(titleFile,$('#prefix').val());
|
||||
});
|
||||
$('#titleList').on('change', function(e) {
|
||||
updateOffsetsFromTitle(this.value);
|
||||
});
|
||||
$('#toggleDebug').on('click', function(e) {
|
||||
switchDebugOnOff();
|
||||
});
|
||||
$('#readData').on('click', function(e) {
|
||||
readArticleFromHtmlForm(dataFiles);
|
||||
});
|
||||
$('#prefix').on('keyup', function(e) {
|
||||
onKeyUpPrefix(e);
|
||||
});
|
||||
// Define behavior of HTML elements
|
||||
$('#searchTitles').on('click', function(e) {
|
||||
searchTitlesFromPrefix($('#prefix').val());
|
||||
});
|
||||
$('#toggleDebug').on('click', function(e) {
|
||||
switchDebugOnOff();
|
||||
});
|
||||
$('#readData').on('click', function(e) {
|
||||
findTitleFromTitleIdAndLaunchArticleRead($('#titleList').val());
|
||||
});
|
||||
$('#prefix').on('keyup', function(e) {
|
||||
onKeyUpPrefix(e);
|
||||
});
|
||||
|
||||
|
||||
// Detect if DeviceStorage is available
|
||||
var storage = null;
|
||||
if ($.isFunction(navigator.getDeviceStorage)) {
|
||||
storage = navigator.getDeviceStorage('sdcard');
|
||||
}
|
||||
// Detect if DeviceStorage is available
|
||||
var storage = null;
|
||||
if ($.isFunction(navigator.getDeviceStorage)) {
|
||||
storage = navigator.getDeviceStorage('sdcard');
|
||||
}
|
||||
|
||||
if (storage != null) {
|
||||
var filerequest = storage.get('wikipedia_small_2010-08-14/wikipedia_00.dat');
|
||||
filerequest.onsuccess = function() {
|
||||
dataFiles = [];
|
||||
dataFiles[0] = filerequest.result;
|
||||
filerequest = storage.get('wikipedia_small_2010-08-14/titles.idx');
|
||||
filerequest.onsuccess = function() {
|
||||
titleFile = filerequest.result;
|
||||
};
|
||||
filerequest.onerror = function(event) {
|
||||
alert("error reading title file : " + event.target.error.name);
|
||||
};
|
||||
};
|
||||
filerequest.onerror = function(event) {
|
||||
alert("error reading data file : " + event.target.error.name);
|
||||
};
|
||||
}
|
||||
else {
|
||||
displayFileSelect();
|
||||
}
|
||||
if (storage != null) {
|
||||
var filerequest = storage.get('wikipedia_small_2010-08-14/wikipedia_00.dat');
|
||||
filerequest.onsuccess = function() {
|
||||
localArchive = new evopedia.LocalArchive();
|
||||
localArchive.dataFiles[0] = filerequest.result;
|
||||
filerequest = storage.get('wikipedia_small_2010-08-14/titles.idx');
|
||||
filerequest.onsuccess = function() {
|
||||
localArchive.titleFile = filerequest.result;
|
||||
};
|
||||
filerequest.onerror = function(event) {
|
||||
alert("error reading title file : " + event.target.error.name);
|
||||
};
|
||||
};
|
||||
filerequest.onerror = function(event) {
|
||||
alert("error reading data file : " + event.target.error.name);
|
||||
};
|
||||
}
|
||||
else {
|
||||
displayFileSelect();
|
||||
}
|
||||
|
||||
/**
|
||||
* Displays the zone to select files from the dump
|
||||
*/
|
||||
function displayFileSelect() {
|
||||
$('#openLocalFiles').show();
|
||||
document.getElementById('dataFiles').addEventListener('change', handleDataFileSelect, false);
|
||||
document.getElementById('titleFile').addEventListener('change', handleTitleFileSelect, false);
|
||||
}
|
||||
|
||||
var debugOn = false;
|
||||
|
||||
/**
|
||||
* Print the given string inside the debug zone
|
||||
* @param string
|
||||
*/
|
||||
function debug(string) {
|
||||
if (debugOn) {
|
||||
document.getElementById("debugTextarea").value+=string+"\n";
|
||||
/**
|
||||
* Displays the zone to select files from the dump
|
||||
*/
|
||||
function displayFileSelect() {
|
||||
$('#openLocalFiles').show();
|
||||
$('#dataFiles').on('change', setLocalArchiveFromFileSelect);
|
||||
$('#titleFile').on('change', setLocalArchiveFromFileSelect);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Switch debug mode On/Off
|
||||
*/
|
||||
function switchDebugOnOff() {
|
||||
if (debugOn == true) {
|
||||
debugOn = false;
|
||||
$('#debugZone').hide();
|
||||
}
|
||||
else {
|
||||
debugOn = true;
|
||||
$('#debugZone').show();
|
||||
}
|
||||
}
|
||||
var debugOn = false;
|
||||
|
||||
/**
|
||||
* Set the Offsets HTML fields from the selected title
|
||||
*/
|
||||
function updateOffsetsFromTitle(selectValue) {
|
||||
var offsets=selectValue.split(/\|/);
|
||||
document.getElementById("filenumber").value=offsets[0];
|
||||
document.getElementById("blockstart").value=offsets[1];
|
||||
document.getElementById("blockoffset").value=offsets[2];
|
||||
document.getElementById("length").value=offsets[3];
|
||||
if (offsets[0]==255) {
|
||||
// It's a redirect : find out the real offsets (asynchronous read)
|
||||
readRedirectOffsets(titleFile,offsets[1]);
|
||||
}
|
||||
else {
|
||||
document.getElementById('redirectfilenumber').value = "";
|
||||
document.getElementById('redirectblockstart').value = "";
|
||||
document.getElementById('redirectblockoffset').value = "";
|
||||
document.getElementById('redirectlength').value = "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This function is recursively called after each asynchronous read,
|
||||
* so that to find the closest index in titleFile to the given prefix
|
||||
*/
|
||||
function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
|
||||
if (lo < hi-1 ) {
|
||||
var mid = Math.round((lo+hi)/2);
|
||||
// TODO : improve the way we read this file : 256 bytes is arbitrary and might be too small
|
||||
var blob = titleFile.slice(mid,mid+256);
|
||||
reader.onload = function(e) {
|
||||
var binaryTitleFile = e.target.result;
|
||||
var byteArray = new Uint8Array(binaryTitleFile);
|
||||
// Look for the index of the next NewLine
|
||||
var newLineIndex=0;
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
var i = newLineIndex+1;
|
||||
newLineIndex = i+15;
|
||||
// Look for the index of the next NewLine
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
var title = evopedia.utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
||||
debug("title found : "+title);
|
||||
if (title.localeCompare(prefix)<0) {
|
||||
lo = mid;
|
||||
}
|
||||
else {
|
||||
hi = mid;
|
||||
}
|
||||
recursivePrefixSearch(titleFile, reader, prefix, lo, hi);
|
||||
};
|
||||
debug("Reading the file from "+mid+" to "+(mid+256)+" because lo="+lo+" and hi="+hi);
|
||||
// Read the file as a binary string
|
||||
reader.readAsArrayBuffer(blob);
|
||||
}
|
||||
else {
|
||||
// We found the closest title
|
||||
debug ("Found the closest title near index "+lo);
|
||||
readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,lo);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search the index for titles that start with the given prefix
|
||||
* (implemented with a binary search inside the index file)
|
||||
*/
|
||||
function searchTitlesFromPrefix(titleFile, prefix) {
|
||||
if (titleFile) {
|
||||
var titleFileSize = titleFile.size;
|
||||
prefix = remove_diacritics.normalizeString(prefix);
|
||||
|
||||
var reader = new FileReader();
|
||||
reader.onerror = errorHandler;
|
||||
reader.onabort = function(e) {
|
||||
alert('Title file read cancelled');
|
||||
};
|
||||
recursivePrefixSearch(titleFile, reader, prefix, 0, titleFileSize);
|
||||
}
|
||||
else {
|
||||
alert ("Title file not set");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the real offsets when a redirect was found, based on the redirectIndex provided
|
||||
* The file read is asynchronous, and populates the html form as soon as the offsets are found
|
||||
* @param titleFile
|
||||
* @param redirectIndex
|
||||
*/
|
||||
function readRedirectOffsets(titleFile,redirectIndex) {
|
||||
var reader = new FileReader();
|
||||
reader.onerror = errorHandler;
|
||||
reader.onabort = function(e) {
|
||||
alert('Title file read cancelled');
|
||||
};
|
||||
reader.onload = function(e) {
|
||||
var binaryTitleFile = e.target.result;
|
||||
var byteArray = new Uint8Array(binaryTitleFile);
|
||||
var filenumber = byteArray[2];
|
||||
|
||||
var blockstart = evopedia.readIntegerFrom4Bytes(byteArray,3);
|
||||
var blockoffset = evopedia.readIntegerFrom4Bytes(byteArray,7);
|
||||
var length = evopedia.readIntegerFrom4Bytes(byteArray,11);
|
||||
|
||||
document.getElementById('redirectfilenumber').value = filenumber;
|
||||
document.getElementById('redirectblockstart').value = blockstart;
|
||||
document.getElementById('redirectblockoffset').value = blockoffset;
|
||||
document.getElementById('redirectlength').value = length;
|
||||
};
|
||||
// Read only the 16 necessary bytes
|
||||
var blob = titleFile.slice(redirectIndex,redirectIndex+16);
|
||||
// Read in the file as a binary string
|
||||
reader.readAsArrayBuffer(blob);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the titles following the given index in the title file, until one of the following conditions is reached :
|
||||
* - the title does not start with the prefix anymore
|
||||
* - we already read the maximum number of titles
|
||||
* and populate the dropdown list
|
||||
*/
|
||||
function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,startIndex) {
|
||||
var reader = new FileReader();
|
||||
reader.onerror = errorHandler;
|
||||
reader.onabort = function(e) {
|
||||
alert('Title file read cancelled');
|
||||
};
|
||||
reader.onload = function(e) {
|
||||
var binaryTitleFile = e.target.result;
|
||||
var byteArray = new Uint8Array(binaryTitleFile);
|
||||
// Look for the index of the next NewLine
|
||||
var newLineIndex=0;
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
/**
|
||||
* Print the given string inside the debug zone
|
||||
*
|
||||
* @param string
|
||||
*/
|
||||
function debug(string) {
|
||||
if (debugOn) {
|
||||
document.getElementById("debugTextarea").value += string + "\n";
|
||||
}
|
||||
var i = newLineIndex;
|
||||
var titleNumber=0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Switch debug mode On/Off
|
||||
*/
|
||||
function switchDebugOnOff() {
|
||||
if (debugOn == true) {
|
||||
debugOn = false;
|
||||
$('#debugZone').hide();
|
||||
} else {
|
||||
debugOn = true;
|
||||
$('#debugZone').show();
|
||||
}
|
||||
}
|
||||
|
||||
function setLocalArchiveFromFileSelect() {
|
||||
dataFiles=document.getElementById('dataFiles').files;
|
||||
titleFile=document.getElementById('titleFile').files[0];
|
||||
localArchive = new evopedia.LocalArchive();
|
||||
localArchive.dataFiles = dataFiles;
|
||||
localArchive.titleFile = titleFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle Enter key in the prefix input zone
|
||||
*/
|
||||
function onKeyUpPrefix(evt) {
|
||||
if (evt.keyCode == 13) {
|
||||
document.getElementById("searchTitles").click();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Search the index for titles that start with the given prefix (implemented
|
||||
* with a binary search inside the index file)
|
||||
*/
|
||||
function searchTitlesFromPrefix(prefix) {
|
||||
if (localArchive.titleFile) {
|
||||
localArchive.findTitlesWithPrefix(prefix, populateDropDownListOfTitles);
|
||||
} else {
|
||||
alert("Title file not set");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Populate the drop-down list of titles with the given list
|
||||
*/
|
||||
function populateDropDownListOfTitles(titleList) {
|
||||
var comboTitleList = document.getElementById('titleList');
|
||||
while (i<byteArray.length && titleNumber<50) {
|
||||
// Look for the index of the next NewLine
|
||||
newLineIndex+=15;
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
|
||||
// Copy the encodedTitle in a new Array
|
||||
var encodedTitle = new Uint8Array(newLineIndex-i);
|
||||
for (var j = 0; j < newLineIndex-i; j++) {
|
||||
encodedTitle[j] = byteArray[i+j];
|
||||
}
|
||||
|
||||
var title = evopedia.Title.parseTitle(encodedTitle, new evopedia.LocalArchive(), i);
|
||||
|
||||
// Skip the titles that do not start with the prefix
|
||||
// TODO use a normalizer to compare the strings
|
||||
if (title && title.getReadableName().toLowerCase().indexOf(prefix.toLowerCase())==0) {
|
||||
comboTitleList.options[titleNumber] = new Option (title.name, title.fileNr + "|" + title.blockStart + "|" + title.blockOffset + "|" + title.articleLength);
|
||||
debug("Title : startIndex = " + i + " endIndex = " + newLineIndex + title.toString());
|
||||
titleNumber++;
|
||||
}
|
||||
i=newLineIndex+1;
|
||||
for (var i=0; i<titleList.length; i++) {
|
||||
var title = titleList[i];
|
||||
comboTitleList.options[i] = new Option (title.name, title.toStringId());
|
||||
}
|
||||
// Update the offsets, as if the first item of the list was selected by the user
|
||||
updateOffsetsFromTitle($('#titleList').val());
|
||||
};
|
||||
var blob = titleFile.slice(startIndex);
|
||||
// Read in the file as a binary string
|
||||
reader.readAsArrayBuffer(blob);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Decompress and read an article in dump files
|
||||
*/
|
||||
function readArticleFromHtmlForm(dataFiles) {
|
||||
document.getElementById("articleContent").innerHTML="Loading article from dump...";
|
||||
if (dataFiles && dataFiles.length>0) {
|
||||
var filenumber = document.getElementById('filenumber').value;
|
||||
var blockstart = document.getElementById('blockstart').value;
|
||||
var blockoffset = document.getElementById('blockoffset').value;
|
||||
var length = document.getElementById('length').value;
|
||||
if (filenumber==255) {
|
||||
// It's a redirect : use redirected offsets
|
||||
filenumber = document.getElementById('redirectfilenumber').value;
|
||||
blockstart = document.getElementById('redirectblockstart').value;
|
||||
blockoffset = document.getElementById('redirectblockoffset').value;
|
||||
length = document.getElementById('redirectlength').value;
|
||||
if (!filenumber || filenumber=="") {
|
||||
// TODO : better handle this case
|
||||
alert("Redirect offsets not read yet");
|
||||
}
|
||||
}
|
||||
var dataFile = null;
|
||||
// Find the good dump file
|
||||
for (var i=0; i<dataFiles.length; i++) {
|
||||
var fileName = dataFiles[i].name;
|
||||
var prefixedFileNumber = "";
|
||||
if (filenumber<10) {
|
||||
prefixedFileNumber = "0"+filenumber;
|
||||
/**
|
||||
* Creates an instance of title from given titleId (including resolving redirects),
|
||||
* and call the function to read the corresponding article
|
||||
*/
|
||||
function findTitleFromTitleIdAndLaunchArticleRead(titleId) {
|
||||
$("#articleContent").html("Loading article from dump...");
|
||||
if (localArchive.dataFiles && localArchive.dataFiles.length>0) {
|
||||
var title = evopedia.Title.parseTitleId(localArchive,titleId);
|
||||
if (title.fileNr == 255) {
|
||||
localArchive.resolveRedirect(title, readArticle);
|
||||
}
|
||||
else {
|
||||
prefixedFileNumber = filenumber;
|
||||
readArticle(title);
|
||||
}
|
||||
var expectedFileName = "wikipedia_"+prefixedFileNumber+".dat";
|
||||
// Check if the fileName ends with the expected file name (in case of DeviceStorage usage, the fileName is prefixed by the directory)
|
||||
if (fileName.match(expectedFileName+"$") == expectedFileName) {
|
||||
dataFile = dataFiles[i];
|
||||
}
|
||||
}
|
||||
if (!dataFile) {
|
||||
alert("File number " + filenumber + " not found");
|
||||
document.getElementById("articleContent").innerHTML="";
|
||||
}
|
||||
else {
|
||||
readArticleFromOffset(dataFile, blockstart, blockoffset, length);
|
||||
alert("Data files not set");
|
||||
}
|
||||
}
|
||||
else {
|
||||
alert("Data files not set");
|
||||
|
||||
/**
|
||||
* Read the article corresponding to the given title
|
||||
*/
|
||||
function readArticle(title) {
|
||||
if ($.isArray(title)) {
|
||||
title = title[0];
|
||||
}
|
||||
localArchive.readArticle(title, displayArticleInForm);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read an article in a dump file, based on given offsets
|
||||
*/
|
||||
function readArticleFromOffset(dataFile, blockstart, blockoffset, length) {
|
||||
|
||||
var reader = new FileReader();
|
||||
reader.onerror = errorHandler;
|
||||
reader.onabort = function(e) {
|
||||
alert('Data file read cancelled');
|
||||
};
|
||||
reader.onload = function(e) {
|
||||
var compressedArticles = e.target.result;
|
||||
//var htmlArticle = ArchUtils.bz2.decode(compressedArticles);
|
||||
// TODO : should be improved by uncompressing the content chunk by chunk,
|
||||
// until the length is reached, instead of uncompressing everything
|
||||
var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));
|
||||
// Start reading at offset, and keep length characters
|
||||
var htmlArticle = htmlArticles.substring(blockoffset,blockoffset+length);
|
||||
// Keep only length characters
|
||||
htmlArticle = htmlArticle.substring(0,length);
|
||||
// Decode UTF-8 encoding
|
||||
htmlArticle = decodeURIComponent(escape(htmlArticle));
|
||||
|
||||
/**
|
||||
* Display the given HTML article in the web page,
|
||||
* and convert links to javascript calls
|
||||
*/
|
||||
function displayArticleInForm(htmlArticle) {
|
||||
// Display the article inside the web page.
|
||||
$('#articleContent').html(htmlArticle);
|
||||
|
||||
|
||||
// Convert links into javascript calls
|
||||
$('#articleContent').find('a').each(function(){
|
||||
// Store current link's url
|
||||
var url = $(this).attr("href");
|
||||
|
||||
if(url.slice(0, 1) == "#") {
|
||||
// It's an anchor link : do nothing
|
||||
}
|
||||
else if (url.substring(0,4) === "http") {
|
||||
// It's an external link : do nothing
|
||||
}
|
||||
else {
|
||||
// It's a link to another article : add an onclick event to go to this article
|
||||
// instead of following the link
|
||||
$(this).on('click', function(e) {
|
||||
goToArticle($(this).attr("href"));
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
});
|
||||
};
|
||||
|
||||
// TODO : should be improved by reading the file chunks by chunks until the article is found,
|
||||
// instead of reading the whole file starting at blockstart
|
||||
var blob = dataFile.slice(blockstart);
|
||||
|
||||
// Read in the image file as a binary string.
|
||||
reader.readAsArrayBuffer(blob);
|
||||
}
|
||||
|
||||
function errorHandler(evt) {
|
||||
switch(evt.target.error.code) {
|
||||
case evt.target.error.NOT_FOUND_ERR:
|
||||
alert('File Not Found!');
|
||||
break;
|
||||
case evt.target.error.NOT_READABLE_ERR:
|
||||
alert('File is not readable');
|
||||
break;
|
||||
case evt.target.error.ABORT_ERR:
|
||||
break; // noop
|
||||
default:
|
||||
alert('An error occurred reading this file.');
|
||||
};
|
||||
}
|
||||
|
||||
function handleDataFileSelect(evt) {
|
||||
dataFiles = evt.target.files;
|
||||
}
|
||||
|
||||
function handleTitleFileSelect(evt) {
|
||||
titleFile = evt.target.files[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle Enter key in the prefix input zone
|
||||
*/
|
||||
function onKeyUpPrefix(evt) {
|
||||
if (evt.keyCode == 13) {
|
||||
document.getElementById("searchTitles").click();
|
||||
// Store current link's url
|
||||
var url = $(this).attr("href");
|
||||
|
||||
if(url.slice(0, 1) == "#") {
|
||||
// It's an anchor link : do nothing
|
||||
}
|
||||
else if (url.substring(0,4) === "http") {
|
||||
// It's an external link : do nothing
|
||||
}
|
||||
else {
|
||||
// It's a link to another article : add an onclick event to go to this article
|
||||
// instead of following the link
|
||||
$(this).on('click', function(e) {
|
||||
goToArticle($(this).attr("href"));
|
||||
return false;
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace article content with the one of the given title
|
||||
*/
|
||||
function goToArticle(title) {
|
||||
// This is awful and does not work very well.
|
||||
// It's just temporary before the algorithm is rewritten in an object-oriented way
|
||||
// TODO : rewrite this with a real article search and display
|
||||
searchTitlesFromPrefix(titleFile,title);
|
||||
updateOffsetsFromTitle($('#titleList').val());
|
||||
document.getElementById("articleContent").innerHTML="";
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace article content with the one of the given title
|
||||
*/
|
||||
function goToArticle(title) {
|
||||
$("#articleContent").html("Loading article from dump...");
|
||||
localArchive.getTitleByName(title, readArticle);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
|
@ -1,5 +1,9 @@
|
||||
define(function(require) {
|
||||
|
||||
// Module dependencies
|
||||
var remove_diacritics = require('remove_diacritics');
|
||||
var bzip2 = require('bzip2');
|
||||
|
||||
/**
|
||||
* Read an integer encoded in 4 bytes
|
||||
*/
|
||||
@ -37,11 +41,263 @@ define(function(require) {
|
||||
* It's still minimal for now. TODO : complete implementation to handle maths and coordinates
|
||||
*/
|
||||
function LocalArchive() {
|
||||
this.directory = null;
|
||||
this.dataFiles = new Array();
|
||||
this.titleFile = null;
|
||||
this.date = null;
|
||||
this.language = null;
|
||||
}
|
||||
// TODO to be replaced by the real archive attributes
|
||||
this.date = "2013-03-14";
|
||||
this.language = "zz";
|
||||
};
|
||||
|
||||
/**
 * Binary search over the byte offsets of the title file, looking for the
 * offset whose following title is closest to the given prefix.
 * Each step reads a small slice of the file asynchronously and then calls
 * itself again from the reader's onload handler with a narrowed [lo, hi]
 * window. Once the window cannot be narrowed any further, the
 * callbackFunction is called with the resulting offset (lo).
 * @param reader FileReader instance reused for every step (its onload
 *               handler is overwritten at each step)
 * @param prefix string compared against the titles with localeCompare
 * @param lo lower bound (byte offset) of the search window
 * @param hi upper bound (byte offset) of the search window
 * @param callbackFunction function called with the offset that was found
 */
LocalArchive.prototype.recursivePrefixSearch = function(reader, prefix, lo, hi, callbackFunction) {
    /**
     * Return the index of the first NewLine byte (10) at or after "from",
     * or the first index past the end of the array if there is none
     */
    function nextNewLine(bytes, from) {
        var index = from;
        while (index < bytes.length && bytes[index] !== 10) {
            index++;
        }
        return index;
    }

    if (!(lo < hi - 1)) {
        // The window cannot be narrowed any more : lo is the closest offset
        callbackFunction(lo);
        return;
    }

    var mid = Math.round((lo + hi) / 2);
    // TODO : improve the way we read this file : 128 bytes is arbitrary and might be too small
    var chunk = this.titleFile.slice(mid, mid + 128);
    var archive = this;

    reader.onload = function(e) {
        var bytes = new Uint8Array(e.target.result);
        // mid usually falls in the middle of an entry : the entry we
        // compare with is the one starting right after the next NewLine
        var entryStart = nextNewLine(bytes, 0) + 1;
        // The title text is read 15 bytes after the start of the entry
        // (presumably a fixed-size binary header - to be confirmed against
        // the title file format)
        var titleStart = entryStart + 15;
        var titleEnd = nextNewLine(bytes, titleStart);
        var title = utf8ByteArrayToString(bytes, titleStart, titleEnd);

        if (title.localeCompare(prefix) < 0) {
            // The title sorts before the prefix : keep the upper half
            archive.recursivePrefixSearch(reader, prefix, mid, hi, callbackFunction);
        }
        else {
            // Otherwise keep the lower half
            archive.recursivePrefixSearch(reader, prefix, lo, mid, callbackFunction);
        }
    };

    // Read the slice as an ArrayBuffer
    reader.readAsArrayBuffer(chunk);
};
|
||||
|
||||
/**
 * Read a single title from the title file at the given offset.
 * This simply delegates to getTitlesStartingAtOffset with a maximum of one
 * title, so the callbackFunction receives whatever that method passes to it
 * (a list holding at most one Title).
 * @param titleOffset offset in the title file
 * @param callbackFunction function called with the result
 */
LocalArchive.prototype.getTitleAtOffset = function(titleOffset, callbackFunction) {
    var maximumTitleCount = 1;
    this.getTitlesStartingAtOffset(titleOffset, maximumTitleCount, callbackFunction);
};
|
||||
|
||||
/**
 * Read the title file from the given offset up to its end, parse the
 * entries found there into Title instances (at most titleCount of them),
 * and call the callbackFunction with the resulting list.
 * The read is asynchronous : the callbackFunction is called from the
 * FileReader onload handler.
 * @param titleOffset offset in the title file where the read starts
 * @param titleCount maximum number of titles to retrieve
 * @param callbackFunction function called with the list of Title instances
 */
LocalArchive.prototype.getTitlesStartingAtOffset = function(titleOffset, titleCount, callbackFunction) {
    var archive = this;
    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };

    reader.onload = function(e) {
        var bytes = new Uint8Array(e.target.result);
        var total = bytes.length;

        // Look for the first NewLine (byte 10) following the offset
        var lineEnd = 0;
        while (lineEnd < total && bytes[lineEnd] !== 10) {
            lineEnd++;
        }

        // The first entry starts on that NewLine byte itself, so it is
        // parsed one byte off, and it is never kept (titleNumber is -1).
        // NOTE(review) : when the offset comes from recursivePrefixSearch,
        // this first entry appears to be the last title that sorted before
        // the prefix, which would explain why it has to be skipped - to be
        // confirmed.
        var entryStart = lineEnd;
        var titles = [];

        for (var titleNumber = -1; entryStart < total && titleNumber < titleCount; titleNumber++) {
            // Jump 15 bytes ahead before looking for the NewLine that ends
            // this entry
            lineEnd += 15;
            while (lineEnd < total && bytes[lineEnd] !== 10) {
                lineEnd++;
            }

            // Copy the bytes of the entry in a new array
            var entryLength = lineEnd - entryStart;
            var encodedTitle = new Uint8Array(entryLength);
            for (var k = 0; k < entryLength; k++) {
                encodedTitle[k] = bytes[entryStart + k];
            }

            var title = Title.parseTitle(encodedTitle, archive, entryStart);

            // TODO use a normalizer to compare the strings
            // TODO see why we need to skip the first title
            if (titleNumber >= 0) {
                titles.push(title);
            }

            // The next entry starts right after the NewLine
            entryStart = lineEnd + 1;
        }

        callbackFunction(titles);
    };

    // Read everything from the offset to the end of the file as an ArrayBuffer
    reader.readAsArrayBuffer(this.titleFile.slice(titleOffset));
};
|
||||
|
||||
/**
 * Look for a title by its name : run the binary search of
 * recursivePrefixSearch over the whole title file with the name as given
 * (no normalization is applied here), then read the title located at the
 * offset that was found and hand it over to the callbackFunction.
 * @param titleName name of the title to look for
 * @param callbackFunction function called with the result of getTitleAtOffset
 */
LocalArchive.prototype.getTitleByName = function(titleName, callbackFunction) {
    var archive = this;
    var searchUpperBound = this.titleFile.size;

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };

    // Once the offset is known, read the title located there
    var readTitleAtFoundOffset = function(titleOffset) {
        archive.getTitleAtOffset(titleOffset, callbackFunction);
    };

    this.recursivePrefixSearch(reader, titleName, 0, searchUpperBound, readTitleAtFoundOffset);
};
|
||||
|
||||
/**
 * Pick a random title and call the callbackFunction with it.
 * Not implemented yet : for now this function does nothing and never calls
 * the callbackFunction.
 * @param callbackFunction function meant to receive the random Title
 */
LocalArchive.prototype.getRandomTitle = function(callbackFunction) {
    // TODO to be implemented
};
|
||||
|
||||
/**
 * Find titles for the given prefix : the prefix is normalized (diacritics
 * removed) when it is not empty, the binary search of recursivePrefixSearch
 * locates the closest offset in the title file, and up to 50 titles are
 * then read from that offset and passed to the callbackFunction.
 * This method does not filter the titles by prefix itself.
 * @param prefix beginning of the titles to look for
 * @param callbackFunction function called with the list of Title instances
 */
LocalArchive.prototype.findTitlesWithPrefix = function(prefix, callbackFunction) {
    var archive = this;
    var searchUpperBound = this.titleFile.size;
    var normalizedPrefix = prefix;
    if (prefix) {
        normalizedPrefix = remove_diacritics.normalizeString(prefix);
    }

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };

    // Once the offset is known, read the titles that follow it
    var readTitlesAtFoundOffset = function(titleOffset) {
        archive.getTitlesStartingAtOffset(titleOffset, 50, callbackFunction);
    };

    this.recursivePrefixSearch(reader, normalizedPrefix, 0, searchUpperBound, readTitlesAtFoundOffset);
};
|
||||
|
||||
/**
 * Read the article described by the given title instance, and call the
 * callbackFunction with the article HTML String.
 * The data file is chosen from this.dataFiles by its name
 * ("wikipedia_NN.dat", NN being title.fileNr on at least two digits). It is
 * then read asynchronously from title.blockStart to its end, decompressed
 * with bzip2, and the article is cut out of the decompressed content using
 * title.blockOffset and title.articleLength.
 * @param title Title instance (fileNr, blockStart, blockOffset and
 *              articleLength are used)
 * @param callbackFunction function called with the HTML of the article
 * @throws a String (not an Error instance) when no data file matches
 *         title.fileNr. It is thrown synchronously, before any read is
 *         started. The thrown value is kept as it was, so that existing
 *         callers see the same thing.
 */
LocalArchive.prototype.readArticle = function(title, callbackFunction) {
    // File numbers below 10 are padded with a leading zero
    var prefixedFileNumber = title.fileNr < 10 ? "0" + title.fileNr : title.fileNr;
    var expectedFileName = "wikipedia_" + prefixedFileNumber + ".dat";

    // Find the good dump file (if several files match, the last one wins)
    var dataFile = null;
    for (var i = 0; i < this.dataFiles.length; i++) {
        var fileName = this.dataFiles[i].name;
        // Check if the fileName ends with the expected file name (in case of
        // DeviceStorage usage, the fileName is prefixed by the directory).
        // This is an exact suffix comparison : building a regular expression
        // from the file name would turn its "." into a wildcard.
        if (fileName.slice(-expectedFileName.length) === expectedFileName) {
            dataFile = this.dataFiles[i];
        }
    }

    if (!dataFile) {
        throw "File number " + title.fileNr + " not found";
    }

    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Data file read cancelled');
    };
    reader.onload = function(e) {
        var compressedArticles = e.target.result;
        // TODO : should be improved by uncompressing the content chunk by chunk,
        // until the length is reached, instead of uncompressing everything
        var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));

        // Convert the offsets to numbers before adding them : they may be
        // strings (presumably when the title was rebuilt from its string
        // id - to be confirmed), and "+" would then concatenate them
        // instead of adding them
        var articleStart = Number(title.blockOffset);
        var articleLength = Number(title.articleLength);

        // Start reading at offset, and keep length characters
        var htmlArticle = htmlArticles.substring(articleStart, articleStart + articleLength);

        // Decode UTF-8 encoding
        htmlArticle = decodeURIComponent(escape(htmlArticle));

        callbackFunction(htmlArticle);
    };

    // TODO : should be improved by reading the file chunks by chunks until the article is found,
    // instead of reading the whole file starting at blockstart
    var blob = dataFile.slice(title.blockStart);

    // Read the data file as an ArrayBuffer
    reader.readAsArrayBuffer(blob);
};
|
||||
|
||||
/**
 * Resolve the redirect of the given title instance, and call the callbackFunction with the redirected Title instance.
 *
 * A 16-byte entry is read from this archive's title file at title.blockStart.
 * The given title object is updated in place with the fileNr (byte 2),
 * blockStart (bytes 3-6), blockOffset (bytes 7-10) and articleLength
 * (bytes 11-14) found in that entry, and the same object is passed on.
 *
 * @param title Title instance to resolve; it is modified by this call
 * @param callbackFunction called with the updated title
 */
LocalArchive.prototype.resolveRedirect = function(title, callbackFunction) {
    var reader = new FileReader();
    reader.onerror = errorHandler;
    reader.onabort = function(e) {
        alert('Title file read cancelled');
    };
    reader.onload = function(e) {
        var binaryTitleFile = e.target.result;
        var byteArray = new Uint8Array(binaryTitleFile);

        // Same object as the one received: the caller's title is overwritten
        var redirectedTitle = title;
        redirectedTitle.fileNr = byteArray[2];
        redirectedTitle.blockStart = readIntegerFrom4Bytes(byteArray, 3);
        redirectedTitle.blockOffset = readIntegerFrom4Bytes(byteArray, 7);
        redirectedTitle.articleLength = readIntegerFrom4Bytes(byteArray, 11);

        callbackFunction(redirectedTitle);
    };

    // Coerce to a number so that "+ 16" adds instead of concatenating when the
    // title was rebuilt from a serialized id
    var entryStart = Number(title.blockStart);
    // Read only the 16 necessary bytes, starting at title.blockStart.
    // The title file belongs to this archive instance (there is no local or
    // parameter named titleFile in this method).
    var blob = this.titleFile.slice(entryStart, entryStart + 16);
    // Read the entry as an ArrayBuffer
    reader.readAsArrayBuffer(blob);
};
|
||||
|
||||
|
||||
/**
|
||||
@ -125,13 +381,57 @@ define(function(require) {
|
||||
return utf8ByteArrayToString(encodedTitle, 15, len);
|
||||
};
|
||||
|
||||
Title.prototype.toStringId = function(){
|
||||
return this.archive.language + "_" + this.archive.date + "_" + this.titleOffset;
|
||||
/**
 * Creates a title instance from a serialized id.
 *
 * The id is the "|"-separated string produced by Title.prototype.toStringId:
 * language|date|fileNr|titleOffset|name|blockStart|blockOffset|articleLength.
 * The language and date parts are not read back; the archive is taken from
 * the localArchive parameter instead.
 *
 * @param localArchive archive the title belongs to
 * @param titleId serialized id of the title
 * @returns {Title} title whose fileNr, titleOffset, blockStart, blockOffset
 *          and articleLength are numbers (NaN if a part is not numeric)
 */
Title.parseTitleId = function(localArchive, titleId) {
    var idParts = titleId.split("|");

    // split() yields strings: convert the numeric fields so that arithmetic
    // such as blockStart + 16 adds instead of concatenating
    var toInteger = function(part) {
        return parseInt(part, 10);
    };

    var title = new Title();
    title.archive = localArchive;
    title.fileNr = toInteger(idParts[2]);
    title.titleOffset = toInteger(idParts[3]);
    title.name = idParts[4];
    title.blockStart = toInteger(idParts[5]);
    title.blockOffset = toInteger(idParts[6]);
    title.articleLength = toInteger(idParts[7]);
    return title;
};
|
||||
|
||||
|
||||
/**
 * Serialize the title with its values, as a "|"-separated string:
 * language|date|fileNr|titleOffset|name|blockStart|blockOffset|articleLength
 * (language and date come from the title's archive).
 * @returns {String}
 */
Title.prototype.toStringId = function(){
    var archive = this.archive;
    var id = archive.language + "|" + archive.date;
    id += "|" + this.fileNr;
    id += "|" + this.titleOffset;
    id += "|" + this.name;
    id += "|" + this.blockStart;
    id += "|" + this.blockOffset;
    id += "|" + this.articleLength;
    return id;
};
|
||||
|
||||
/**
 * Serialize the title in a readable way: the serialized id followed by the
 * name, fileNr, blockStart, blockOffset and articleLength, each labelled and
 * separated by a space.
 * @returns {String}
 */
Title.prototype.toString = function(){
    // The leading space before "title.name" keeps the id from running into the next label
    return "title.id = " + this.toStringId()
        + " title.name = " + this.name
        + " title.fileNr = " + this.fileNr
        + " title.blockStart = " + this.blockStart
        + " title.blockOffset = " + this.blockOffset
        + " title.articleLength = " + this.articleLength;
};
|
||||
|
||||
/**
 * ErrorHandler for FileReader.
 *
 * Shows an alert describing the error carried by evt.target.error. The error
 * is recognized either by its name (NotFoundError, NotReadableError,
 * AbortError) or by comparing its code with the legacy constants
 * (NOT_FOUND_ERR, NOT_READABLE_ERR, ABORT_ERR) when the error object exposes
 * them. Aborts are ignored; anything else, including a missing error object,
 * gets a generic message.
 *
 * @param evt error event of the FileReader
 */
function errorHandler(evt) {
    var genericMessage = 'An error occurred reading this file.';
    var error = (evt && evt.target) ? evt.target.error : null;
    if (!error) {
        alert(genericMessage);
        return;
    }

    // A legacy constant only counts when the error object actually defines it:
    // otherwise an undefined code would "match" an undefined constant.
    function isKind(legacyConstant, exceptionName) {
        if (error.name === exceptionName) {
            return true;
        }
        var legacyCode = error[legacyConstant];
        return typeof legacyCode === 'number' && error.code === legacyCode;
    }

    if (isKind('NOT_FOUND_ERR', 'NotFoundError')) {
        alert('File Not Found!');
    }
    else if (isKind('NOT_READABLE_ERR', 'NotReadableError')) {
        alert('File is not readable');
    }
    else if (isKind('ABORT_ERR', 'AbortError')) {
        // noop
    }
    else {
        alert(genericMessage);
    }
}
|
||||
|
||||
/**
|
||||
* Functions and classes exposed by this module
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user