diff --git a/www/index.html b/www/index.html
index 8ee57d1f..6bf8e1fa 100644
--- a/www/index.html
+++ b/www/index.html
@@ -48,7 +48,7 @@ License:
To use it, you have to first download locally a dump from http://dumpathome.evopedia.info/dumps/finished (with a Bittorrent client), and select some of the dowloaded files below.
- Current status : I have tested it with the small dump (2010-08-14), the French dump (2012-02-03) and the French wiktionary dump (2011-03-16). It does NOT work on the English dump for now (I'm working on it)
+ Current status : I have tested it with the small dump (2010-08-14), the French dump (2012-02-03), the French wiktionary dump (2011-03-16) and the English dump (2012-02-11)
diff --git a/www/js/app.js b/www/js/app.js
index 269830e0..a2d6f753 100644
--- a/www/js/app.js
+++ b/www/js/app.js
@@ -20,6 +20,7 @@ define(function(require) {
// Evopedia javascript dependencies
var bzip2 = require('bzip2');
var remove_diacritics = require('remove_diacritics');
+ var evopedia = require('evopedia');
var dataFiles=document.getElementById('dataFiles').files;
@@ -126,38 +127,6 @@ function updateOffsetsFromTitle(selectValue) {
}
}
-/**
- * Read an integer encoded in 4 bytes
- */
-function readIntegerFrom4Bytes(byteArray,firstIndex) {
- return byteArray[firstIndex] + byteArray[firstIndex+1]*256 + byteArray[firstIndex+2]*65536 + byteArray[firstIndex+3]*16777216;
-}
-
-/**
- * Converts a UTF-8 byte array to JavaScript's 16-bit Unicode.
- * @param {Array.} bytes UTF-8 byte array.
- * @return {string} 16-bit Unicode string.
- * Copied from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
- */
-function utf8ByteArrayToString(bytes,startIndex,endIndex) {
- var out = [], pos = startIndex, c = 0;
- while (pos < bytes.length && pos < endIndex) {
- var c1 = bytes[pos++];
- if (c1 < 128) {
- out[c++] = String.fromCharCode(c1);
- } else if (c1 > 191 && c1 < 224) {
- var c2 = bytes[pos++];
- out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
- } else {
- var c2 = bytes[pos++];
- var c3 = bytes[pos++];
- out[c++] = String.fromCharCode(
- (c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
- }
- }
- return out.join('');
-};
-
/**
* This function is recursively called after each asynchronous read,
* so that to find the closest index in titleFile to the given prefix
@@ -172,16 +141,16 @@ function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
var byteArray = new Uint8Array(binaryTitleFile);
// Look for the index of the next NewLine
var newLineIndex=0;
- while (newLineIndex=0 && title) {
- debug("Found title : escape1="+escape1+" escape2="+escape2+" filenumber="+filenumber+" blockstart="+blockstart+" blockoffset="+blockoffset+" length="+length+" title="+title);
// TODO : check if the title starts with prefix, and return if it does not
- comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
+ comboTitleList.options[titleNumber] = new Option (title.name, title.fileNr + "|" + title.blockStart + "|" + title.blockOffset + "|" + title.articleLength);
+ debug("Title : startIndex = " + i + " endIndex = " + newLineIndex + " title.name = " + title.name + " title.fileNr = " + title.fileNr + " title.blockStart = " + title.blockStart + " title.blockOffset = " + title.blockOffset + " title.articleLength = " + title.articleLength);
}
titleNumber++;
- i=newLineIndex-1;
+ i=newLineIndex+1;
}
// Update the offsets, as if the first item of the list was selected by the user
updateOffsetsFromTitle($('#titleList').val());
diff --git a/www/js/lib/bzip2.js b/www/js/lib/bzip2.js
index 0388158c..8322cf27 100644
--- a/www/js/lib/bzip2.js
+++ b/www/js/lib/bzip2.js
@@ -253,9 +253,9 @@ bzip2.decompress = function(bits, size, len){
return {
- array: function(bytes) { return bzip2.array(bytes);},
- simple: function(bits) { return bzip2.simple(bits);},
- header: function(bits) { return bzip2.header(bits);},
- decompress: function(bits, size, len) { return bzip2.decompress(bits, size, len);}
+ array: bzip2.array,
+ simple: bzip2.simple,
+ header: bzip2.header,
+ decompress: bzip2.decompress
};
});
diff --git a/www/js/lib/evopedia.js b/www/js/lib/evopedia.js
new file mode 100644
index 00000000..37653918
--- /dev/null
+++ b/www/js/lib/evopedia.js
@@ -0,0 +1,131 @@
+define(function(require) {
+
+ /**
+  * Read the unsigned 32-bit integer stored in little-endian order in byteArray[firstIndex] .. byteArray[firstIndex+3]
+  */
+ function readIntegerFrom4Bytes(byteArray,firstIndex) {
+     return ((byteArray[firstIndex+3]*256 + byteArray[firstIndex+2])*256 + byteArray[firstIndex+1])*256 + byteArray[firstIndex];
+ }
+
+ /**
+  * Converts a UTF-8 byte array to JavaScript's 16-bit Unicode (4-byte sequences become surrogate pairs).
+  * @param {Array.<number>} bytes UTF-8 byte array.
+  * @param {number} startIndex Index of the first byte to decode.
+  * @param {number} endIndex Index just after the last byte to decode.
+  * @return {string} 16-bit Unicode string.
+  * Adapted from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0) */
+ function utf8ByteArrayToString(bytes,startIndex,endIndex) {
+     var out = [], pos = startIndex, c = 0;
+     while (pos < bytes.length && pos < endIndex) {
+         var c1 = bytes[pos++];
+         if (c1 < 128) {
+             out[c++] = String.fromCharCode(c1);
+         } else if (c1 > 191 && c1 < 224) {
+             out[c++] = String.fromCharCode((c1 & 31) << 6 | bytes[pos++] & 63);
+         } else if (c1 > 239 && c1 < 248) { // 4 bytes : code point above U+FFFF, needs two UTF-16 units
+             var u = ((c1 & 7) << 18 | (bytes[pos++] & 63) << 12 | (bytes[pos++] & 63) << 6 | bytes[pos++] & 63) - 0x10000;
+             out[c++] = String.fromCharCode(0xD800 + (u >> 10), 0xDC00 + (u & 1023));
+         } else {
+             out[c++] = String.fromCharCode((c1 & 15) << 12 | (bytes[pos++] & 63) << 6 | bytes[pos++] & 63);
+         }
+     }
+     return out.join('');
+ }
+
+ /**
+ * LocalArchive class : defines a wikipedia dump on the filesystem. The constructor only initializes its two fields, directory and titleFile, to null (presumably they are filled in later by the caller - to be confirmed).
+ * It's still minimal for now. TODO : complete implementation to handle maths and coordinates
+ */
+ function LocalArchive() {
+ this.directory = null;
+ this.titleFile = null;
+ }
+
+
+ /**
+  * Title class : defines the title of an article and some methods to manipulate it.
+  * A new instance has all of its fields set to null.
+  */
+ function Title() {
+     // Fields are listed in the order in which they are created on the instance
+     var fieldNames = ['name', 'fileNr', 'blockStart', 'blockOffset',
+                       'articleLength', 'archive', 'titleOffset', 'titleEntryLength'];
+     for (var k = 0; k < fieldNames.length; k++) {
+         this[fieldNames[k]] = null;
+     }
+ }
+
+
+
+ /**
+  * Creates a Title instance from an encoded title line from a title file.
+  * Bytes 0-1 hold escape bits (not handled yet), byte 2 the file number, bytes 3-6, 7-10 and 11-14 the block start,
+  * block offset and article length (little-endian), and bytes 15 onwards the UTF-8 encoded name.
+  * @param {Uint8Array} encodedTitle Bytes of one title line, with or without its trailing newline byte (10).
+  * @param {LocalArchive} archive Archive stored on the returned Title; must not be null.
+  * @param {number} titleOffset Offset stored on the returned Title; must not be negative.
+  * @return {Title} The parsed title, or null if encodedTitle is null or shorter than 15 bytes.
+  * @throws {string} If archive is null or titleOffset is negative.
+  */
+ Title.parseTitle = function(encodedTitle, archive, titleOffset) {
+     if (archive == null) {
+         throw "archive cannot be null";
+     }
+     if (titleOffset < 0) {
+         throw "titleOffset cannot be negative (was " + titleOffset + ")";
+     }
+     if (encodedTitle == null || encodedTitle.length < 15) {
+         return null;
+     }
+     var t = new Title();
+     t.archive = archive;
+     t.titleOffset = titleOffset;
+     // encodedTitle contains byte values : the newline test must use 10, because 10 == '\n' is false (and 0 == '\n' is true)
+     var endsWithNewLine = (encodedTitle[encodedTitle.length - 1] === 10);
+     t.titleEntryLength = endsWithNewLine ? encodedTitle.length : encodedTitle.length + 1;
+     // TODO : handle escapes
+     /*
+     int escapes = LittleEndianReader.readUInt16(encodedTitle, 0);
+     byte[] positionData = new byte[13];
+     System.arraycopy(encodedTitle, 2, positionData, 0, 13);
+
+     if ((escapes & (1 << 14)) != 0)
+         escapes |= '\n';
+
+     for (int i = 0; i < 13; i ++) {
+         if ((escapes & (1 << i)) != 0)
+             positionData[i] = '\n';
+     }
+     */
+     t.fileNr = encodedTitle[2];
+     t.blockStart = readIntegerFrom4Bytes(encodedTitle, 3);
+     t.blockOffset = readIntegerFrom4Bytes(encodedTitle, 7);
+     t.articleLength = readIntegerFrom4Bytes(encodedTitle, 11);
+     t.name = Title.parseNameOnly(encodedTitle);
+     return t;
+ };
+
+ /**
+  * Retrieves the name of an article from an encoded title line : decodes the UTF-8 bytes found from index 15 onwards, leaving out a trailing newline byte (10) if there is one.
+  * @param {Uint8Array} encodedTitle Bytes of one title line (numbers, hence the comparison with 10 and not with '\n').
+  * @return {string} The article name, or null if encodedTitle is shorter than 15 bytes.
+  */
+ Title.parseNameOnly = function(encodedTitle) {
+     var len = encodedTitle.length;
+     if (len < 15) {
+         return null;
+     }
+     var endsWithNewLine = len > 15 && encodedTitle[len - 1] === 10;
+     return utf8ByteArrayToString(encodedTitle, 15, endsWithNewLine ? len - 1 : len);
+ };
+
+ /**
+ * Functions and classes exposed by this module (the object returned by the define callback) : the two byte-decoding helpers and the LocalArchive and Title constructors
+ */
+ return {
+ readIntegerFrom4Bytes: readIntegerFrom4Bytes,
+ utf8ByteArrayToString : utf8ByteArrayToString,
+ LocalArchive : LocalArchive,
+ Title : Title
+ };
+});
\ No newline at end of file
diff --git a/www/js/lib/remove_diacritics.js b/www/js/lib/remove_diacritics.js
index 8314888b..a9ab56b2 100644
--- a/www/js/lib/remove_diacritics.js
+++ b/www/js/lib/remove_diacritics.js
@@ -102,8 +102,6 @@ function normalizeString(string) {
return {
- normalizeString: function(string) {
- return normalizeString(string);
- }
+ normalizeString: normalizeString
};
});