mirror of
https://github.com/kiwix/kiwix-js.git
synced 2025-09-21 11:27:00 -04:00
Started refactoring the code in a more object-oriented way
Fixed the algorithm on the English dump
This commit is contained in:
parent
5260b75587
commit
66e42247ff
@ -48,7 +48,7 @@ License:
|
||||
<br />
|
||||
To use it, you have to first download locally a dump from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a Bittorrent client), and select some of the downloaded files below.
|
||||
<br />
|
||||
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a> and the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a>. <b>It does NOT work on the English dump for now</b> (I'm working on it)
|
||||
Current status : I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||
<br />
|
||||
<br />
|
||||
<ul>
|
||||
|
@ -20,6 +20,7 @@ define(function(require) {
|
||||
// Evopedia javascript dependencies
|
||||
var bzip2 = require('bzip2');
|
||||
var remove_diacritics = require('remove_diacritics');
|
||||
var evopedia = require('evopedia');
|
||||
|
||||
|
||||
var dataFiles=document.getElementById('dataFiles').files;
|
||||
@ -126,38 +127,6 @@ function updateOffsetsFromTitle(selectValue) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reads a 4-byte integer stored least-significant byte first.
 *
 * @param {Array.<number>|Uint8Array} byteArray Source of the bytes.
 * @param {number} firstIndex Index of the least-significant byte.
 * @return {number} The sum byte0 + byte1*256 + byte2*65536 + byte3*16777216.
 */
function readIntegerFrom4Bytes(byteArray,firstIndex) {
    // Multiplication (rather than bit shifts) is used so the result is never
    // truncated to a signed 32-bit value.
    var value = byteArray[firstIndex];
    var weight = 256;
    for (var k = 1; k < 4; k++) {
        value += byteArray[firstIndex + k] * weight;
        weight *= 256;
    }
    return value;
}
|
||||
|
||||
/**
 * Converts a slice of a UTF-8 byte array to a JavaScript (UTF-16) string.
 * Adapted from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
 *
 * Decoding is lenient: continuation bytes are not validated, and a multi-byte
 * sequence that starts before endIndex is read to its end even if that runs
 * past endIndex.
 *
 * @param {Array.<number>|Uint8Array} bytes UTF-8 byte array.
 * @param {number=} startIndex Index of the first byte to decode (defaults to 0).
 * @param {number=} endIndex Index just past the last byte to decode
 *     (defaults to bytes.length, and is clamped to it).
 * @return {string} 16-bit Unicode string.
 */
function utf8ByteArrayToString(bytes,startIndex,endIndex) {
    var out = [], c = 0;
    var pos = (startIndex === undefined) ? 0 : startIndex;
    var end = (endIndex === undefined || endIndex > bytes.length) ? bytes.length : endIndex;
    var c1, c2, c3, c4, codePoint;
    while (pos < end) {
        c1 = bytes[pos++];
        if (c1 < 128) {
            // Single byte (ASCII)
            out[c++] = String.fromCharCode(c1);
        } else if (c1 > 191 && c1 < 224) {
            // Two-byte sequence
            c2 = bytes[pos++];
            out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
        } else if (c1 > 239 && c1 < 248) {
            // Four-byte sequence: the code point is above U+FFFF, so it has to
            // be emitted as a UTF-16 surrogate pair.
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            c4 = bytes[pos++];
            codePoint = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - 0x10000;
            out[c++] = String.fromCharCode(0xD800 + (codePoint >> 10));
            out[c++] = String.fromCharCode(0xDC00 + (codePoint & 1023));
        } else {
            // Three-byte sequence (also the fallback for any other lead byte)
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            out[c++] = String.fromCharCode(
                (c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
        }
    }
    return out.join('');
}
|
||||
|
||||
/**
|
||||
* This function is recursively called after each asynchronous read,
|
||||
* in order to find the closest index in titleFile to the given prefix
|
||||
@ -172,16 +141,16 @@ function recursivePrefixSearch(titleFile, reader, prefix, lo, hi) {
|
||||
var byteArray = new Uint8Array(binaryTitleFile);
|
||||
// Look for the index of the next NewLine
|
||||
var newLineIndex=0;
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
var i = newLineIndex-1;
|
||||
var i = newLineIndex+1;
|
||||
newLineIndex = i+15;
|
||||
// Look for the index of the next NewLine
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
var title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
||||
var title = evopedia.utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
||||
debug("title found : "+title);
|
||||
if (title.localeCompare(prefix)<0) {
|
||||
lo = mid;
|
||||
@ -240,9 +209,9 @@ function readRedirectOffsets(titleFile,redirectIndex) {
|
||||
var byteArray = new Uint8Array(binaryTitleFile);
|
||||
var filenumber = byteArray[2];
|
||||
|
||||
var blockstart = readIntegerFrom4Bytes(byteArray,3);
|
||||
var blockoffset = readIntegerFrom4Bytes(byteArray,7);
|
||||
var length = readIntegerFrom4Bytes(byteArray,11);
|
||||
var blockstart = evopedia.readIntegerFrom4Bytes(byteArray,3);
|
||||
var blockoffset = evopedia.readIntegerFrom4Bytes(byteArray,7);
|
||||
var length = evopedia.readIntegerFrom4Bytes(byteArray,11);
|
||||
|
||||
document.getElementById('redirectfilenumber').value = filenumber;
|
||||
document.getElementById('redirectblockstart').value = blockstart;
|
||||
@ -272,42 +241,35 @@ function readTitlesBeginningAtIndexStartingWithPrefix(titleFile,prefix,startInde
|
||||
var byteArray = new Uint8Array(binaryTitleFile);
|
||||
// Look for the index of the next NewLine
|
||||
var newLineIndex=0;
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
var i = newLineIndex;
|
||||
var titleNumber=-1;
|
||||
var comboTitleList = document.getElementById('titleList');
|
||||
while (i<byteArray.length && titleNumber<50) {
|
||||
var filenumber = 0;
|
||||
var blockstart = 0;
|
||||
var blockoffset = 0;
|
||||
var length = 0;
|
||||
var title = "";
|
||||
|
||||
// TODO : interpret escape area
|
||||
var escape1 = byteArray[i];
|
||||
var escape2 = byteArray[i+1];
|
||||
filenumber = byteArray[i+2];
|
||||
|
||||
blockstart = readIntegerFrom4Bytes(byteArray,i+3);
|
||||
blockoffset = readIntegerFrom4Bytes(byteArray,i+7);
|
||||
length = readIntegerFrom4Bytes(byteArray,i+11);
|
||||
var newLineIndex = i+15;
|
||||
|
||||
// Look for the index of the next NewLine
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
|
||||
// Look for the index of the next NewLine
|
||||
newLineIndex+=15;
|
||||
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=10) {
|
||||
newLineIndex++;
|
||||
}
|
||||
title = utf8ByteArrayToString(byteArray,i+15,newLineIndex);
|
||||
|
||||
// Copy the encodedTitle in a new Array
|
||||
var encodedTitle = new Uint8Array(newLineIndex-i);
|
||||
for (var j = 0; j < newLineIndex-i; j++) {
|
||||
encodedTitle[j] = byteArray[i+j];
|
||||
}
|
||||
|
||||
var title = evopedia.Title.parseTitle(encodedTitle, new evopedia.LocalArchive(), i);
|
||||
|
||||
// Skip the first title
|
||||
if (titleNumber>=0 && title) {
|
||||
debug("Found title : escape1="+escape1+" escape2="+escape2+" filenumber="+filenumber+" blockstart="+blockstart+" blockoffset="+blockoffset+" length="+length+" title="+title);
|
||||
// TODO : check if the title starts with prefix, and return if it does not
|
||||
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
|
||||
comboTitleList.options[titleNumber] = new Option (title.name, title.fileNr + "|" + title.blockStart + "|" + title.blockOffset + "|" + title.articleLength);
|
||||
debug("Title : startIndex = " + i + " endIndex = " + newLineIndex + " title.name = " + title.name + " title.fileNr = " + title.fileNr + " title.blockStart = " + title.blockStart + " title.blockOffset = " + title.blockOffset + " title.articleLength = " + title.articleLength);
|
||||
}
|
||||
titleNumber++;
|
||||
i=newLineIndex-1;
|
||||
i=newLineIndex+1;
|
||||
}
|
||||
// Update the offsets, as if the first item of the list was selected by the user
|
||||
updateOffsetsFromTitle($('#titleList').val());
|
||||
|
@ -253,9 +253,9 @@ bzip2.decompress = function(bits, size, len){
|
||||
|
||||
|
||||
return {
|
||||
array: function(bytes) { return bzip2.array(bytes);},
|
||||
simple: function(bits) { return bzip2.simple(bits);},
|
||||
header: function(bits) { return bzip2.header(bits);},
|
||||
decompress: function(bits, size, len) { return bzip2.decompress(bits, size, len);}
|
||||
array: bzip2.array,
|
||||
simple: bzip2.simple,
|
||||
header: bzip2.header,
|
||||
decompress: bzip2.decompress
|
||||
};
|
||||
});
|
||||
|
131
www/js/lib/evopedia.js
Normal file
131
www/js/lib/evopedia.js
Normal file
@ -0,0 +1,131 @@
|
||||
define(function(require) {
|
||||
|
||||
/**
 * Reads a 4-byte integer stored least-significant byte first.
 *
 * @param {Array.<number>|Uint8Array} byteArray Source of the bytes.
 * @param {number} firstIndex Index of the least-significant byte.
 * @return {number} The sum byte0 + byte1*256 + byte2*65536 + byte3*16777216.
 */
function readIntegerFrom4Bytes(byteArray,firstIndex) {
    // Multiplication (rather than bit shifts) is used so the result is never
    // truncated to a signed 32-bit value.
    var value = byteArray[firstIndex];
    var weight = 256;
    for (var k = 1; k < 4; k++) {
        value += byteArray[firstIndex + k] * weight;
        weight *= 256;
    }
    return value;
}
|
||||
|
||||
/**
 * Converts a slice of a UTF-8 byte array to a JavaScript (UTF-16) string.
 * Adapted from http://closure-library.googlecode.com/svn/docs/closure_goog_crypt.js.source.html (Apache License 2.0)
 *
 * Decoding is lenient: continuation bytes are not validated, and a multi-byte
 * sequence that starts before endIndex is read to its end even if that runs
 * past endIndex.
 *
 * @param {Array.<number>|Uint8Array} bytes UTF-8 byte array.
 * @param {number=} startIndex Index of the first byte to decode (defaults to 0).
 * @param {number=} endIndex Index just past the last byte to decode
 *     (defaults to bytes.length, and is clamped to it).
 * @return {string} 16-bit Unicode string.
 */
function utf8ByteArrayToString(bytes,startIndex,endIndex) {
    var out = [], c = 0;
    var pos = (startIndex === undefined) ? 0 : startIndex;
    var end = (endIndex === undefined || endIndex > bytes.length) ? bytes.length : endIndex;
    var c1, c2, c3, c4, codePoint;
    while (pos < end) {
        c1 = bytes[pos++];
        if (c1 < 128) {
            // Single byte (ASCII)
            out[c++] = String.fromCharCode(c1);
        } else if (c1 > 191 && c1 < 224) {
            // Two-byte sequence
            c2 = bytes[pos++];
            out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
        } else if (c1 > 239 && c1 < 248) {
            // Four-byte sequence: the code point is above U+FFFF, so it has to
            // be emitted as a UTF-16 surrogate pair.
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            c4 = bytes[pos++];
            codePoint = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) - 0x10000;
            out[c++] = String.fromCharCode(0xD800 + (codePoint >> 10));
            out[c++] = String.fromCharCode(0xDC00 + (codePoint & 1023));
        } else {
            // Three-byte sequence (also the fallback for any other lead byte)
            c2 = bytes[pos++];
            c3 = bytes[pos++];
            out[c++] = String.fromCharCode(
                (c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
        }
    }
    return out.join('');
}
|
||||
|
||||
/**
 * LocalArchive class : represents a wikipedia dump stored on the filesystem.
 * Only placeholder fields exist for now.
 * TODO : complete implementation to handle maths and coordinates
 */
function LocalArchive() {
    // Every field starts out unset (null); nothing in this constructor fills them in.
    var fields = ['directory', 'titleFile'];
    for (var k = 0; k < fields.length; k++) {
        this[fields[k]] = null;
    }
}
|
||||
|
||||
|
||||
/**
 * Title class : holds the title of an article together with the location
 * fields read from its title-file entry. All fields start as null.
 */
function Title() {
    var fields = [
        'name',
        'fileNr',
        'blockStart',
        'blockOffset',
        'articleLength',
        'archive',
        'titleOffset',
        'titleEntryLength'
    ];
    for (var k = 0; k < fields.length; k++) {
        this[fields[k]] = null;
    }
}
|
||||
|
||||
|
||||
/**
 * Creates a Title instance from an encoded title line from a title file.
 *
 * Layout of an entry as read here:
 *   bytes 0-1   : escape flags (not interpreted yet, see TODO below)
 *   byte  2     : file number
 *   bytes 3-6   : block start     (read with readIntegerFrom4Bytes)
 *   bytes 7-10  : block offset    (read with readIntegerFrom4Bytes)
 *   bytes 11-14 : article length  (read with readIntegerFrom4Bytes)
 *   bytes 15+   : UTF-8 encoded name, optionally followed by a newline byte
 *
 * @param {Array.<number>|Uint8Array} encodedTitle Bytes of one title entry.
 * @param {LocalArchive} archive Archive the title belongs to.
 * @param {number} titleOffset Offset of the entry; stored as-is on the Title.
 * @return {Title} The parsed title, or null if encodedTitle is null or
 *     shorter than the 15-byte header.
 * @throws {string} If archive is null/undefined or titleOffset is negative.
 */
Title.parseTitle = function(encodedTitle, archive, titleOffset) {
    if (archive == null) {
        throw "archive cannot be null";
    }
    if (titleOffset < 0) {
        throw "titleOffset cannot be negative (was " + titleOffset + ")";
    }
    if (encodedTitle == null || encodedTitle.length < 15) {
        return null;
    }

    var t = new Title();
    t.archive = archive;
    t.titleOffset = titleOffset;

    // The entry length always counts the terminating newline, whether or not
    // the caller included it. The bytes are numbers, so compare against the
    // byte value 10: comparing with the string '\n' coerces it to 0 and never
    // matches a real newline byte.
    var NEWLINE_BYTE = 10;
    if (encodedTitle[encodedTitle.length - 1] === NEWLINE_BYTE) {
        t.titleEntryLength = encodedTitle.length;
    } else {
        t.titleEntryLength = encodedTitle.length + 1;
    }

    // TODO : handle escapes
    /*
    int escapes = LittleEndianReader.readUInt16(encodedTitle, 0);
    byte[] positionData = new byte[13];
    System.arraycopy(encodedTitle, 2, positionData, 0, 13);

    if ((escapes & (1 << 14)) != 0)
        escapes |= '\n';

    for (int i = 0; i < 13; i ++) {
        if ((escapes & (1 << i)) != 0)
            positionData[i] = '\n';
    }
    */

    t.fileNr = encodedTitle[2];
    t.blockStart = readIntegerFrom4Bytes(encodedTitle, 3);
    t.blockOffset = readIntegerFrom4Bytes(encodedTitle, 7);
    t.articleLength = readIntegerFrom4Bytes(encodedTitle, 11);

    t.name = Title.parseNameOnly(encodedTitle);

    return t;
};
|
||||
|
||||
/**
 * Retrieves the name of an article from an encoded title line.
 *
 * The name is the UTF-8 text that follows the 15-byte header; a single
 * trailing newline byte, if present, is not part of the name.
 *
 * @param {Array.<number>|Uint8Array} encodedTitle Bytes of one title entry.
 * @return {string} The decoded name, or null if the entry is shorter than
 *     the 15-byte header.
 */
Title.parseNameOnly = function(encodedTitle) {
    var len = encodedTitle.length;
    if (len < 15) {
        return null;
    }
    // The bytes are numbers, so test for the newline byte value (10).
    // Comparing with the string '\n' coerces it to 0 and would strip a
    // trailing 0 byte instead of a newline.
    var NEWLINE_BYTE = 10;
    if (len > 15 && encodedTitle[len - 1] === NEWLINE_BYTE) {
        len--;
    }
    return utf8ByteArrayToString(encodedTitle, 15, len);
};
|
||||
|
||||
/**
|
||||
* Functions and classes exposed by this module
|
||||
*/
|
||||
return {
|
||||
readIntegerFrom4Bytes: readIntegerFrom4Bytes,
|
||||
utf8ByteArrayToString : utf8ByteArrayToString,
|
||||
LocalArchive : LocalArchive,
|
||||
Title : Title
|
||||
};
|
||||
});
|
@ -102,8 +102,6 @@ function normalizeString(string) {
|
||||
|
||||
|
||||
return {
|
||||
normalizeString: function(string) {
|
||||
return normalizeString(string);
|
||||
}
|
||||
normalizeString: normalizeString
|
||||
};
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user