Started refactoring the code in a more object-oriented way

Fixed the algorithm on the English dump
This commit is contained in:
mossroy 2013-03-14 17:35:45 +01:00
parent 5260b75587
commit 66e42247ff
5 changed files with 161 additions and 70 deletions

View File

@ -48,7 +48,7 @@ License:
<br />
To use it, you first have to download a dump locally from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a BitTorrent client), and select some of the downloaded files below.
<br />
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a> and the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a>. <b>It does NOT work on the English dump for now</b> (I'm working on it)
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
<br />
<br />
<ul>

View File

@ -20,6 +20,7 @@ define(function(require) {
// Evopedia javascript dependencies
var bzip2 = require('bzip2');
var remove_diacritics = require('remove_diacritics');
var evopedia = require('evopedia');
var dataFiles=document.getElementById('dataFiles').files;
@ -126,38 +127,6 @@ function updateOffsetsFromTitle(selectValue) {
}
}
/**
 * Decode an integer stored on 4 consecutive elements, least significant first
 * (little-endian when every element is a byte value between 0 and 255).
 * @param {Array.<number>|Uint8Array} byteArray array to read from
 * @param {number} firstIndex index of the least significant element
 * @return {number} the decoded integer (NaN if an index falls outside the array)
 */
function readIntegerFrom4Bytes(byteArray,firstIndex) {
    var value = 0;
    // Start from the most significant element and shift the accumulator
    // by one byte (x256) before adding each less significant one
    for (var k = 3; k >= 0; k--) {
        value = value * 256 + byteArray[firstIndex + k];
    }
    return value;
}
/**
 * Converts a slice of a UTF-8 byte array to JavaScript's 16-bit Unicode.
 * Handles 1, 2, 3 and 4-byte sequences; code points above U+FFFF (4-byte
 * sequences) are emitted as a UTF-16 surrogate pair.
 * A multi-byte sequence that starts before endIndex is decoded entirely,
 * even if its continuation bytes are located at or after endIndex.
 * @param {Array.<number>|Uint8Array} bytes UTF-8 byte array.
 * @param {number=} startIndex index of the first byte to decode (defaults to 0).
 * @param {number=} endIndex index after the last byte to decode (defaults to bytes.length).
 * @return {string} 16-bit Unicode string.
 * Adapted from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
 */
function utf8ByteArrayToString(bytes,startIndex,endIndex) {
    var out = [], c = 0;
    var pos = (startIndex === undefined) ? 0 : startIndex;
    var end = (endIndex === undefined) ? bytes.length : endIndex;
    var c1, c2, c3, c4, offsetCodePoint;
    while (pos < bytes.length && pos < end) {
        c1 = bytes[pos++];
        if (c1 < 128) {
            // Single byte (ASCII)
            out[c++] = String.fromCharCode(c1);
        } else if (c1 > 191 && c1 < 224) {
            // 2-byte sequence
            c2 = bytes[pos++];
            out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
        } else if (c1 > 239 && c1 < 248) {
            // 4-byte sequence : the code point does not fit in 16 bits,
            // so it has to be split into a high and a low surrogate
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            c4 = bytes[pos++];
            offsetCodePoint = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - 0x10000;
            out[c++] = String.fromCharCode(0xD800 + (offsetCodePoint >> 10));
            out[c++] = String.fromCharCode(0xDC00 + (offsetCodePoint & 1023));
        } else {
            // 3-byte sequence
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            out[c++] = String.fromCharCode(
                (c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
        }
    }
    return out.join('');
}
/**
* This function is recursively called after each asynchronous read,
* so that to find the closest index in titleFile to the given prefix
@ -172,16 +141,16 @@ function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
var byteArray = new Uint8Array(binaryTitleFile);
// Look for the index of the next NewLine
var newLineIndex=0;
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
newLineIndex++;
}
var i = newLineIndex-1;
var i = newLineIndex+1;
newLineIndex = i+15;
// Look for the index of the next NewLine
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
newLineIndex++;
}
var title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
var title = evopedia.utf8ByteArrayToString(byteArray,i+15,newLineIndex);
debug("title found : "+title);
if (title.localeCompare(prefix)<0) {
lo = mid;
@ -240,9 +209,9 @@ function readRedirectOffsets(titleFile,redirectIndex) {
var byteArray = new Uint8Array(binaryTitleFile);
var filenumber = byteArray[2];
var blockstart = readIntegerFrom4Bytes(byteArray,3);
var blockoffset = readIntegerFrom4Bytes(byteArray,7);
var length = readIntegerFrom4Bytes(byteArray,11);
var blockstart = evopedia.readIntegerFrom4Bytes(byteArray,3);
var blockoffset = evopedia.readIntegerFrom4Bytes(byteArray,7);
var length = evopedia.readIntegerFrom4Bytes(byteArray,11);
document.getElementById('redirectfilenumber').value = filenumber;
document.getElementById('redirectblockstart').value = blockstart;
@ -272,42 +241,35 @@ function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,startInde
var byteArray = new Uint8Array(binaryTitleFile);
// Look for the index of the next NewLine
var newLineIndex=0;
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
newLineIndex++;
}
var i = newLineIndex;
var titleNumber=-1;
var comboTitleList = document.getElementById('titleList');
while (i<byteArray.length && titleNumber<50) {
var filenumber = 0;
var blockstart = 0;
var blockoffset = 0;
var length = 0;
var title = "";
// TODO : interpret escape area
var escape1 = byteArray[i];
var escape2 = byteArray[i+1];
filenumber = byteArray[i+2];
blockstart = readIntegerFrom4Bytes(byteArray,i+3);
blockoffset = readIntegerFrom4Bytes(byteArray,i+7);
length = readIntegerFrom4Bytes(byteArray,i+11);
var newLineIndex = i+15;
// Look for the index of the next NewLine
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
// Look for the index of the next NewLine
newLineIndex+=15;
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
newLineIndex++;
}
title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
// Copy the encodedTitle in a new Array
var encodedTitle = new Uint8Array(newLineIndex-i);
for (var j = 0; j < newLineIndex-i; j++) {
encodedTitle[j] = byteArray[i+j];
}
var title = evopedia.Title.parseTitle(encodedTitle, new evopedia.LocalArchive(), i);
// Skip the first title
if (titleNumber>=0 && title) {
debug("Found title : escape1="+escape1+" escape2="+escape2+" filenumber="+filenumber+" blockstart="+blockstart+" blockoffset="+blockoffset+" length="+length+" title="+title);
// TODO : check if the title starts with prefix, and return if it does not
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
comboTitleList.options[titleNumber] = new Option (title.name, title.fileNr + "|" + title.blockStart + "|" + title.blockOffset + "|" + title.articleLength);
debug("Title : startIndex = " + i + " endIndex = " + newLineIndex + " title.name = " + title.name + " title.fileNr = " + title.fileNr + " title.blockStart = " + title.blockStart + " title.blockOffset = " + title.blockOffset + " title.articleLength = " + title.articleLength);
}
titleNumber++;
i=newLineIndex-1;
i=newLineIndex+1;
}
// Update the offsets, as if the first item of the list was selected by the user
updateOffsetsFromTitle($('#titleList').val());

View File

@ -253,9 +253,9 @@ bzip2.decompress = function(bits, size, len){
return {
array: function(bytes) { return bzip2.array(bytes);},
simple: function(bits) { return bzip2.simple(bits);},
header: function(bits) { return bzip2.header(bits);},
decompress: function(bits, size, len) { return bzip2.decompress(bits, size, len);}
array: bzip2.array,
simple: bzip2.simple,
header: bzip2.header,
decompress: bzip2.decompress
};
});

131
www/js/lib/evopedia.js Normal file
View File

@ -0,0 +1,131 @@
define(function(require) {
/**
 * Decode an integer stored on 4 consecutive elements, least significant first
 * (little-endian when every element is a byte value between 0 and 255).
 * @param {Array.<number>|Uint8Array} byteArray array to read from
 * @param {number} firstIndex index of the least significant element
 * @return {number} the decoded integer (NaN if an index falls outside the array)
 */
function readIntegerFrom4Bytes(byteArray,firstIndex) {
    var value = 0;
    // Start from the most significant element and shift the accumulator
    // by one byte (x256) before adding each less significant one
    for (var k = 3; k >= 0; k--) {
        value = value * 256 + byteArray[firstIndex + k];
    }
    return value;
}
/**
 * Converts a slice of a UTF-8 byte array to JavaScript's 16-bit Unicode.
 * Handles 1, 2, 3 and 4-byte sequences; code points above U+FFFF (4-byte
 * sequences) are emitted as a UTF-16 surrogate pair.
 * A multi-byte sequence that starts before endIndex is decoded entirely,
 * even if its continuation bytes are located at or after endIndex.
 * @param {Array.<number>|Uint8Array} bytes UTF-8 byte array.
 * @param {number=} startIndex index of the first byte to decode (defaults to 0).
 * @param {number=} endIndex index after the last byte to decode (defaults to bytes.length).
 * @return {string} 16-bit Unicode string.
 * Adapted from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
 */
function utf8ByteArrayToString(bytes,startIndex,endIndex) {
    var out = [], c = 0;
    var pos = (startIndex === undefined) ? 0 : startIndex;
    var end = (endIndex === undefined) ? bytes.length : endIndex;
    var c1, c2, c3, c4, offsetCodePoint;
    while (pos < bytes.length && pos < end) {
        c1 = bytes[pos++];
        if (c1 < 128) {
            // Single byte (ASCII)
            out[c++] = String.fromCharCode(c1);
        } else if (c1 > 191 && c1 < 224) {
            // 2-byte sequence
            c2 = bytes[pos++];
            out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
        } else if (c1 > 239 && c1 < 248) {
            // 4-byte sequence : the code point does not fit in 16 bits,
            // so it has to be split into a high and a low surrogate
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            c4 = bytes[pos++];
            offsetCodePoint = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - 0x10000;
            out[c++] = String.fromCharCode(0xD800 + (offsetCodePoint >> 10));
            out[c++] = String.fromCharCode(0xDC00 + (offsetCodePoint & 1023));
        } else {
            // 3-byte sequence
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            out[c++] = String.fromCharCode(
                (c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
        }
    }
    return out.join('');
}
/**
 * LocalArchive class : describes a wikipedia dump stored on the filesystem.
 * Every field starts as null and is expected to be filled in by the caller.
 * It's still minimal for now. TODO : complete implementation to handle maths and coordinates
 */
function LocalArchive() {
    // Fields are listed in declaration order so that enumeration order is stable
    ['directory', 'titleFile'].forEach(function(fieldName) {
        this[fieldName] = null;
    }, this);
}
/**
 * Title class : defines the title of an article and some methods to manipulate it.
 * All fields are null until they are filled by Title.parseTitle.
 */
function Title() {
    this.name = null;
    this.fileNr = null;
    this.blockStart = null;
    this.blockOffset = null;
    this.articleLength = null;
    this.archive = null;
    this.titleOffset = null;
    this.titleEntryLength = null;
}

// Byte value of the line feed that terminates an entry of the title file.
// Encoded titles are arrays of numbers : comparing an element with the string '\n'
// would coerce the string to 0 and therefore never match an actual line feed.
var NEWLINE_BYTE = 10;

/**
 * Creates a Title instance from an encoded title line from a title file.
 * Layout read by this function : byte 2 is the file number, bytes 3-6 the block start,
 * bytes 7-10 the block offset, bytes 11-14 the article length (4-byte integers decoded
 * by readIntegerFrom4Bytes), and the bytes from index 15 are the UTF-8 encoded name.
 * Bytes 0-1 (escapes) are not interpreted yet.
 * @param {Uint8Array|Array.<number>} encodedTitle bytes of one title entry, with or without its trailing line feed
 * @param {Object} archive archive the title belongs to
 * @param {number} titleOffset offset of the entry in the title file
 * @return {Title} the parsed title, or null if encodedTitle is null or shorter than 15 bytes
 * @throws {string} if archive is null/undefined or titleOffset is negative
 */
Title.parseTitle = function(encodedTitle, archive, titleOffset) {
    if (archive == null) {
        throw "archive cannot be null";
    }
    if (titleOffset < 0) {
        throw "titleOffset cannot be negative (was " + titleOffset + ")";
    }
    if (encodedTitle == null || encodedTitle.length < 15) {
        return null;
    }

    var t = new Title();
    t.archive = archive;
    t.titleOffset = titleOffset;

    // The entry length always accounts for the terminating line feed,
    // whether or not it is included in encodedTitle
    if (encodedTitle[encodedTitle.length - 1] === NEWLINE_BYTE) {
        t.titleEntryLength = encodedTitle.length;
    } else {
        t.titleEntryLength = encodedTitle.length + 1;
    }

    // TODO : handle escapes
    // Reference logic, not ported to JavaScript yet :
    /*
    int escapes = LittleEndianReader.readUInt16(encodedTitle, 0);
    byte[] positionData = new byte[13];
    System.arraycopy(encodedTitle, 2, positionData, 0, 13);
    if ((escapes & (1 << 14)) != 0)
    escapes |= '\n';
    for (int i = 0; i < 13; i ++) {
    if ((escapes & (1 << i)) != 0)
    positionData[i] = '\n';
    }
    */

    t.fileNr = encodedTitle[2];
    t.blockStart = readIntegerFrom4Bytes(encodedTitle, 3);
    t.blockOffset = readIntegerFrom4Bytes(encodedTitle, 7);
    t.articleLength = readIntegerFrom4Bytes(encodedTitle, 11);
    t.name = Title.parseNameOnly(encodedTitle);
    return t;
};

/**
 * Retrieves the name of an article from an encoded title line.
 * The name is the UTF-8 decoding of the bytes located after the 15-byte header,
 * without the trailing line feed if there is one.
 * @param {Uint8Array|Array.<number>} encodedTitle bytes of one title entry
 * @return {string} the name of the article, or null if encodedTitle is shorter than 15 bytes
 */
Title.parseNameOnly = function(encodedTitle) {
    var len = encodedTitle.length;
    if (len < 15) {
        return null;
    }
    if (len > 15 && encodedTitle[len - 1] === NEWLINE_BYTE) {
        // Do not include the line terminator in the name
        len--;
    }
    return utf8ByteArrayToString(encodedTitle, 15, len);
};
/**
* Functions and classes exposed by this module
*/
return {
readIntegerFrom4Bytes: readIntegerFrom4Bytes,
utf8ByteArrayToString : utf8ByteArrayToString,
LocalArchive : LocalArchive,
Title : Title
};
});

View File

@ -102,8 +102,6 @@ function normalizeString(string) {
return {
normalizeString: function(string) {
return normalizeString(string);
}
normalizeString: normalizeString
};
});