mirror of
https://github.com/kiwix/kiwix-js.git
synced 2025-09-24 04:54:51 -04:00
Started refactoring the code in a more object-oriented way
Fixed the algorithm on the English dump
This commit is contained in:
parent
5260b75587
commit
66e42247ff
@ -48,7 +48,7 @@ License:
|
|||||||
<br />
|
<br />
|
||||||
To use it, you first have to download a dump locally from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a Bittorrent client), and select some of the downloaded files below.
|
To use it, you first have to download a dump locally from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a Bittorrent client), and select some of the downloaded files below.
|
||||||
<br />
|
<br />
|
||||||
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a> and the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a>. <b>It does NOT work on the English dump for now</b> (I'm working on it)
|
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||||
<br />
|
<br />
|
||||||
<br />
|
<br />
|
||||||
<ul>
|
<ul>
|
||||||
|
@ -20,6 +20,7 @@ define(function(require) {
|
|||||||
// Evopedia javascript dependencies
|
// Evopedia javascript dependencies
|
||||||
var bzip2 = require('bzip2');
|
var bzip2 = require('bzip2');
|
||||||
var remove_diacritics = require('remove_diacritics');
|
var remove_diacritics = require('remove_diacritics');
|
||||||
|
var evopedia = require('evopedia');
|
||||||
|
|
||||||
|
|
||||||
var dataFiles=document.getElementById('dataFiles').files;
|
var dataFiles=document.getElementById('dataFiles').files;
|
||||||
@ -126,38 +127,6 @@ function updateOffsetsFromTitle(selectValue) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Read an integer encoded in 4 bytes
|
|
||||||
*/
|
|
||||||
function readIntegerFrom4Bytes(byteArray,firstIndex) {
|
|
||||||
return byteArray[firstIndex] + byteArray[firstIndex+1]*256 + byteArray[firstIndex+2]*65536 + byteArray[firstIndex+3]*16777216;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts a UTF-8 byte array to JavaScript's 16-bit Unicode.
|
|
||||||
* @param {Array.<number>} bytes UTF-8 byte array.
|
|
||||||
* @return {string} 16-bit Unicode string.
|
|
||||||
* Copied from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
|
|
||||||
*/
|
|
||||||
function utf8ByteArrayToString(bytes,startIndex,endIndex) {
|
|
||||||
var out = [], pos = startIndex, c = 0;
|
|
||||||
while (pos < bytes.length && pos < endIndex) {
|
|
||||||
var c1 = bytes[pos++];
|
|
||||||
if (c1 < 128) {
|
|
||||||
out[c++] = String.fromCharCode(c1);
|
|
||||||
} else if (c1 > 191 && c1 < 224) {
|
|
||||||
var c2 = bytes[pos++];
|
|
||||||
out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
|
|
||||||
} else {
|
|
||||||
var c2 = bytes[pos++];
|
|
||||||
var c3 = bytes[pos++];
|
|
||||||
out[c++] = String.fromCharCode(
|
|
||||||
(c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out.join('');
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function is recursively called after each asynchronous read,
|
* This function is recursively called after each asynchronous read,
|
||||||
* so that to find the closest index in titleFile to the given prefix
|
* so that to find the closest index in titleFile to the given prefix
|
||||||
@ -172,16 +141,16 @@ function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
|
|||||||
var byteArray = new Uint8Array(binaryTitleFile);
|
var byteArray = new Uint8Array(binaryTitleFile);
|
||||||
// Look for the index of the next NewLine
|
// Look for the index of the next NewLine
|
||||||
var newLineIndex=0;
|
var newLineIndex=0;
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||||
newLineIndex++;
|
newLineIndex++;
|
||||||
}
|
}
|
||||||
var i = newLineIndex-1;
|
var i = newLineIndex+1;
|
||||||
newLineIndex = i+15;
|
newLineIndex = i+15;
|
||||||
// Look for the index of the next NewLine
|
// Look for the index of the next NewLine
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||||
newLineIndex++;
|
newLineIndex++;
|
||||||
}
|
}
|
||||||
var title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
var title = evopedia.utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
||||||
debug("title found : "+title);
|
debug("title found : "+title);
|
||||||
if (title.localeCompare(prefix)<0) {
|
if (title.localeCompare(prefix)<0) {
|
||||||
lo = mid;
|
lo = mid;
|
||||||
@ -240,9 +209,9 @@ function readRedirectOffsets(titleFile,redirectIndex) {
|
|||||||
var byteArray = new Uint8Array(binaryTitleFile);
|
var byteArray = new Uint8Array(binaryTitleFile);
|
||||||
var filenumber = byteArray[2];
|
var filenumber = byteArray[2];
|
||||||
|
|
||||||
var blockstart = readIntegerFrom4Bytes(byteArray,3);
|
var blockstart = evopedia.readIntegerFrom4Bytes(byteArray,3);
|
||||||
var blockoffset = readIntegerFrom4Bytes(byteArray,7);
|
var blockoffset = evopedia.readIntegerFrom4Bytes(byteArray,7);
|
||||||
var length = readIntegerFrom4Bytes(byteArray,11);
|
var length = evopedia.readIntegerFrom4Bytes(byteArray,11);
|
||||||
|
|
||||||
document.getElementById('redirectfilenumber').value = filenumber;
|
document.getElementById('redirectfilenumber').value = filenumber;
|
||||||
document.getElementById('redirectblockstart').value = blockstart;
|
document.getElementById('redirectblockstart').value = blockstart;
|
||||||
@ -272,42 +241,35 @@ function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,startInde
|
|||||||
var byteArray = new Uint8Array(binaryTitleFile);
|
var byteArray = new Uint8Array(binaryTitleFile);
|
||||||
// Look for the index of the next NewLine
|
// Look for the index of the next NewLine
|
||||||
var newLineIndex=0;
|
var newLineIndex=0;
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||||
newLineIndex++;
|
newLineIndex++;
|
||||||
}
|
}
|
||||||
var i = newLineIndex;
|
var i = newLineIndex;
|
||||||
var titleNumber=-1;
|
var titleNumber=-1;
|
||||||
var comboTitleList = document.getElementById('titleList');
|
var comboTitleList = document.getElementById('titleList');
|
||||||
while (i<byteArray.length && titleNumber<50) {
|
while (i<byteArray.length && titleNumber<50) {
|
||||||
var filenumber = 0;
|
// Look for the index of the next NewLine
|
||||||
var blockstart = 0;
|
newLineIndex+=15;
|
||||||
var blockoffset = 0;
|
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||||
var length = 0;
|
|
||||||
var title = "";
|
|
||||||
|
|
||||||
// TODO : interpret escape area
|
|
||||||
var escape1 = byteArray[i];
|
|
||||||
var escape2 = byteArray[i+1];
|
|
||||||
filenumber = byteArray[i+2];
|
|
||||||
|
|
||||||
blockstart = readIntegerFrom4Bytes(byteArray,i+3);
|
|
||||||
blockoffset = readIntegerFrom4Bytes(byteArray,i+7);
|
|
||||||
length = readIntegerFrom4Bytes(byteArray,i+11);
|
|
||||||
var newLineIndex = i+15;
|
|
||||||
|
|
||||||
// Look for the index of the next NewLine
|
|
||||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
|
||||||
newLineIndex++;
|
newLineIndex++;
|
||||||
}
|
}
|
||||||
title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
|
||||||
|
// Copy the encodedTitle in a new Array
|
||||||
|
var encodedTitle = new Uint8Array(newLineIndex-i);
|
||||||
|
for (var j = 0; j < newLineIndex-i; j++) {
|
||||||
|
encodedTitle[j] = byteArray[i+j];
|
||||||
|
}
|
||||||
|
|
||||||
|
var title = evopedia.Title.parseTitle(encodedTitle, new evopedia.LocalArchive(), i);
|
||||||
|
|
||||||
// Skip the first title
|
// Skip the first title
|
||||||
if (titleNumber>=0 && title) {
|
if (titleNumber>=0 && title) {
|
||||||
debug("Found title : escape1="+escape1+" escape2="+escape2+" filenumber="+filenumber+" blockstart="+blockstart+" blockoffset="+blockoffset+" length="+length+" title="+title);
|
|
||||||
// TODO : check if the title starts with prefix, and return if it does not
|
// TODO : check if the title starts with prefix, and return if it does not
|
||||||
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
|
comboTitleList.options[titleNumber] = new Option (title.name, title.fileNr + "|" + title.blockStart + "|" + title.blockOffset + "|" + title.articleLength);
|
||||||
|
debug("Title : startIndex = " + i + " endIndex = " + newLineIndex + " title.name = " + title.name + " title.fileNr = " + title.fileNr + " title.blockStart = " + title.blockStart + " title.blockOffset = " + title.blockOffset + " title.articleLength = " + title.articleLength);
|
||||||
}
|
}
|
||||||
titleNumber++;
|
titleNumber++;
|
||||||
i=newLineIndex-1;
|
i=newLineIndex+1;
|
||||||
}
|
}
|
||||||
// Update the offsets, as if the first item of the list was selected by the user
|
// Update the offsets, as if the first item of the list was selected by the user
|
||||||
updateOffsetsFromTitle($('#titleList').val());
|
updateOffsetsFromTitle($('#titleList').val());
|
||||||
|
@ -253,9 +253,9 @@ bzip2.decompress = function(bits, size, len){
|
|||||||
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
array: function(bytes) { return bzip2.array(bytes);},
|
array: bzip2.array,
|
||||||
simple: function(bits) { return bzip2.simple(bits);},
|
simple: bzip2.simple,
|
||||||
header: function(bits) { return bzip2.header(bits);},
|
header: bzip2.header,
|
||||||
decompress: function(bits, size, len) { return bzip2.decompress(bits, size, len);}
|
decompress: bzip2.decompress
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
131
www/js/lib/evopedia.js
Normal file
131
www/js/lib/evopedia.js
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
define(function(require) {
|
||||||
|
|
||||||
|
/**
 * Decode an unsigned integer stored on 4 consecutive bytes, least
 * significant byte first (little-endian).
 *
 * @param {Array.<number>|Uint8Array} byteArray Array of byte values
 * @param {number} firstIndex Index of the least significant byte
 * @return {number} The decoded integer (NaN if a byte is missing)
 */
function readIntegerFrom4Bytes(byteArray, firstIndex) {
    // Multiplications are used instead of bit shifts so that the result
    // is not truncated to a signed 32-bit integer.
    var value = 0;
    for (var k = 3; k >= 0; k--) {
        value = value * 256 + byteArray[firstIndex + k];
    }
    return value;
}
|
||||||
|
|
||||||
|
/**
 * Converts a range of a UTF-8 byte array to a JavaScript (UTF-16) string.
 *
 * Sequences of 1, 2, 3 and 4 bytes are supported. Code points above U+FFFF
 * (4-byte sequences) are emitted as a surrogate pair.
 * Bytes that would be read past the end of the array count as 0.
 *
 * Adapted from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
 *
 * @param {Array.<number>|Uint8Array} bytes UTF-8 byte array.
 * @param {number=} startIndex Index of the first byte to decode (defaults to 0).
 * @param {number=} endIndex Index just after the last byte to decode
 *     (defaults to the length of the array).
 * @return {string} 16-bit Unicode string.
 */
function utf8ByteArrayToString(bytes, startIndex, endIndex) {
    var pos = (startIndex == null) ? 0 : startIndex;
    var end = (endIndex == null) ? bytes.length : endIndex;
    var out = [];
    var c1, c2, c3, c4, codePoint;
    while (pos < bytes.length && pos < end) {
        c1 = bytes[pos++];
        if (c1 < 128) {
            // Single byte (ASCII)
            out.push(String.fromCharCode(c1));
        } else if (c1 > 191 && c1 < 224) {
            // 2-byte sequence
            c2 = bytes[pos++];
            out.push(String.fromCharCode((c1 & 31) << 6 | c2 & 63));
        } else if (c1 > 239 && c1 < 248) {
            // 4-byte sequence : the code point does not fit in one UTF-16
            // unit, so it has to be split into a high and a low surrogate
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            c4 = bytes[pos++];
            codePoint = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - 0x10000;
            out.push(String.fromCharCode(0xD800 + (codePoint >> 10)));
            out.push(String.fromCharCode(0xDC00 + (codePoint & 1023)));
        } else {
            // 3-byte sequence (also used, as before, for any other lead byte)
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            out.push(String.fromCharCode(
                (c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63));
        }
    }
    return out.join('');
}
|
||||||
|
|
||||||
|
/**
 * LocalArchive class : represents a wikipedia dump stored on the filesystem.
 * Both properties start as null and are expected to be filled by the caller.
 * This is a minimal implementation for now.
 * TODO : complete implementation to handle maths and coordinates
 *
 * @constructor
 */
function LocalArchive() {
    // Location of the dump (not set by this constructor)
    this.directory = null;
    // Title file of the dump (not set by this constructor)
    this.titleFile = null;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Title class : holds the title of an article together with the
 * information needed to locate this article in an archive.
 * Every property is initialised to null ; Title.parseTitle is the factory
 * that fills them from an encoded line of a title file.
 *
 * @constructor
 */
function Title() {
    // Name of the article
    this.name = null;

    // Location of the article data
    this.fileNr = null;
    this.blockStart = null;
    this.blockOffset = null;
    this.articleLength = null;

    // Archive the title belongs to
    this.archive = null;

    // Position and size of the title entry
    this.titleOffset = null;
    this.titleEntryLength = null;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Creates a Title instance from an encoded title line from a title file.
 *
 * Layout of an encoded line, as read by this function :
 *   bytes 0-1   : escape bit field (16 bits, little-endian)
 *   byte  2     : file number
 *   bytes 3-6   : block start (32 bits, little-endian)
 *   bytes 7-10  : block offset (32 bits, little-endian)
 *   bytes 11-14 : article length (32 bits, little-endian)
 *   bytes 15... : name of the article in UTF-8, optionally followed by a newline
 *
 * @param {Array.<number>|Uint8Array} encodedTitle Bytes of the title line
 * @param {LocalArchive} archive Archive the title belongs to
 * @param {number} titleOffset Offset of the line (stored as is in the Title)
 * @return {Title} The parsed title, or null if encodedTitle is null or
 *     shorter than 15 bytes
 * @throws {string} If archive is null or titleOffset is negative
 */
Title.parseTitle = function(encodedTitle, archive, titleOffset) {
    // Byte value of '\n'. The bytes are numbers, so they must be compared
    // with this number : comparing them with the string '\n' would coerce
    // the string to 0 and never match a real newline.
    var NEWLINE = 10;

    if (archive == null) {
        throw "archive cannot be null";
    }
    if (titleOffset < 0) {
        throw "titleOffset cannot be negative (was " + titleOffset + ")";
    }
    if (encodedTitle == null || encodedTitle.length < 15) {
        return null;
    }

    var t = new Title();
    t.archive = archive;
    t.titleOffset = titleOffset;

    // The length of the entry always counts the terminating newline,
    // whether or not it is present in encodedTitle
    var lineLength = encodedTitle.length;
    if (encodedTitle[lineLength - 1] === NEWLINE) {
        t.titleEntryLength = lineLength;
    } else {
        t.titleEntryLength = lineLength + 1;
    }

    // Escapes : bit i of the bit field means that byte i of the 13 bytes of
    // position data has to be read as a newline (presumably because a raw
    // newline can not be stored inside a line of the title file).
    // Bit 14 means the same for the low byte of the bit field itself.
    var escapes = encodedTitle[0] + encodedTitle[1] * 256;
    if ((escapes & (1 << 14)) !== 0) {
        escapes |= NEWLINE;
    }
    var positionData = [];
    for (var i = 0; i < 13; i++) {
        if ((escapes & (1 << i)) !== 0) {
            positionData[i] = NEWLINE;
        } else {
            positionData[i] = encodedTitle[i + 2];
        }
    }

    t.fileNr = positionData[0];
    t.blockStart = readIntegerFrom4Bytes(positionData, 1);
    t.blockOffset = readIntegerFrom4Bytes(positionData, 5);
    t.articleLength = readIntegerFrom4Bytes(positionData, 9);

    t.name = Title.parseNameOnly(encodedTitle);

    return t;
};
|
||||||
|
|
||||||
|
/**
 * Retrieves the name of an article from an encoded title line.
 * The name starts at byte 15 of the line ; a terminating newline byte,
 * if present, is not part of the name.
 *
 * @param {Array.<number>|Uint8Array} encodedTitle Bytes of the title line
 * @return {string} The decoded name, or null if the line is shorter than
 *     15 bytes
 */
Title.parseNameOnly = function(encodedTitle) {
    var end = encodedTitle.length;
    if (end < 15) {
        return null;
    }
    // The bytes are numbers : compare with 10 (the byte value of '\n').
    // Comparing with the string '\n' would coerce it to 0, so the newline
    // would never be removed.
    if (end > 15 && encodedTitle[end - 1] === 10) {
        end--;
    }
    return utf8ByteArrayToString(encodedTitle, 15, end);
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Functions and classes exposed by this module
|
||||||
|
*/
|
||||||
|
return {
|
||||||
|
readIntegerFrom4Bytes: readIntegerFrom4Bytes,
|
||||||
|
utf8ByteArrayToString : utf8ByteArrayToString,
|
||||||
|
LocalArchive : LocalArchive,
|
||||||
|
Title : Title
|
||||||
|
};
|
||||||
|
});
|
@ -102,8 +102,6 @@ function normalizeString(string) {
|
|||||||
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
normalizeString: function(string) {
|
normalizeString: normalizeString
|
||||||
return normalizeString(string);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user