mirror of
https://github.com/kiwix/kiwix-js.git
synced 2025-09-22 03:52:21 -04:00
Optimization of the algorithm that reads an article, so that it does not
read the entire dump for a single article. Reading the whole dump was causing crashes on devices (probably because they don't have enough memory), and was very slow, even on a computer. Now, the algorithm reads the file in chunks of 128 KB. Usually, 128 KB seems to be enough to read the entire article.
This commit is contained in:
parent
f643192a9b
commit
59ba7bf1cc
@ -48,13 +48,13 @@ License:
|
||||
<br />
|
||||
To use it, you first have to download a dump locally from <a href="http://dumpathome.evopedia.info/dumps/finished">http://dumpathome.evopedia.info/dumps/finished</a> (with a BitTorrent client), and select some of the downloaded files below.
|
||||
<br />
|
||||
I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2012-02-03.torrent">French dump (2012-02-03)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||
I have tested it with the <a href="http://evopedia.info/dumps/wikipedia_small_2010-08-14.torrent">small dump (2010-08-14)</a>, the <a href="http://evopedia.info/dumps/wikipedia_fr_2013-02-16.torrent">French dump (2013-02-16)</a>, the <a href="http://evopedia.info/dumps/wikipedia_frwiktionary_2011-03-16.torrent">French wiktionary dump (2011-03-16)</a> and the <a href="http://evopedia.info/dumps/wikipedia_en_2012-02-11.torrent">English dump (2012-02-11)</a>
|
||||
<br />
|
||||
<br />
|
||||
<ul>
|
||||
<li>On desktops, it works on recent Firefox and Chrome, and maybe on other browsers</li>
|
||||
<li>On the Firefox OS simulator, you have (for now) to put the small dump files in a "fake-sdcard" folder of your Firefox profile (ex : ~/.mozilla/firefox/xxxx.default/extensions/r2d2b2g@mozilla.org/profile/fake-sdcard). It looks for wikipedia_small_2010-08-14/titles.idx in it. You also need to install the application from the dashboard of the simulator instead of accessing it via the browser (due to security restrictions in Firefox OS : only certified webapps can access the sdcard)</li>
|
||||
<li>On a real Firefox OS device, you also have (for now) to put the small dump files at the root of your sdcard, so that it finds a file /wikipedia_small_2010-08-14/titles.idx on it</li>
|
||||
<li>On the Firefox OS simulator, you have (for now) to put the French dump files in a "fake-sdcard/evopedia" folder of your Firefox profile (ex : ~/.mozilla/firefox/xxxx.default/extensions/r2d2b2g@mozilla.org/profile/fake-sdcard). It looks for evopedia/wikipedia_fr_2013-02-16/titles.idx in it. You also need to install the application from the dashboard of the simulator instead of accessing it via the browser (due to security restrictions in Firefox OS : only certified webapps can access the sdcard)</li>
|
||||
<li>On a real Firefox OS device, you also have (for now) to put the French dump files in an "evopedia" directory at the root of your sdcard, so that it finds a file /evopedia/wikipedia_fr_2013-02-16/titles.idx on it</li>
|
||||
</ul>
|
||||
<br />
|
||||
It's only a proof of concept so far : there are many many ways this could be enhanced (suggestions and patches are welcome : the source code is on <a href="https://github.com/mossroy/evopedia-html5">github</a>). In particular :
|
||||
@ -62,7 +62,7 @@ License:
|
||||
<li>The performance has to be optimized when reading an article</li>
|
||||
<li>Some searches (for example with prefix "a" on the French dump) do not give any result even if they should</li>
|
||||
<li>In some cases, the links inside an article do not work, or do not lead to the right article</li>
|
||||
<li>On a real device (with small memory) and a normal dump, reading an article crashes because it loads too many things in memory</li>
|
||||
<li>On a real device, reading an article sometimes crashes because it loads too many things in memory</li>
|
||||
<li>It is hardly usable on a device because the buttons and inputs are too small</li>
|
||||
<li>Following the links in an article does not populate the history of the browser, which prevents the use of the back button</li>
|
||||
</ul>
|
||||
|
@ -42,8 +42,8 @@ define(function(require) {
|
||||
}
|
||||
|
||||
if (storage != null) {
|
||||
var directory = 'wikipedia_small_2010-08-14';
|
||||
//var directory = 'evopedia/wikipedia_fr_2012-02-03';
|
||||
//var directory = 'evopedia/wikipedia_small_2010-08-14';
|
||||
var directory = 'evopedia/wikipedia_fr_2013-02-16';
|
||||
localArchive = new evopedia.LocalArchive();
|
||||
localArchive.readTitleFile(storage, directory);
|
||||
localArchive.readDataFiles(storage, directory, 0);
|
||||
|
@ -4,6 +4,9 @@ define(function(require) {
|
||||
var remove_diacritics = require('remove_diacritics');
|
||||
var bzip2 = require('bzip2');
|
||||
|
||||
// Size of chunks read in the dump files : 128 KB
|
||||
const CHUNK_SIZE = 131072;
|
||||
|
||||
/**
|
||||
* Read an integer encoded in 4 bytes
|
||||
*/
|
||||
@ -273,64 +276,97 @@ define(function(require) {
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* Read an article from the title instance, and call the callbackFunction with the article HTML String
|
||||
* @param title
|
||||
* @param callbackFunction
|
||||
*/
|
||||
|
||||
/**
|
||||
* Read an article from the title instance, and call the
|
||||
* callbackFunction with the article HTML String
|
||||
*
|
||||
* @param title
|
||||
* @param callbackFunction
|
||||
*/
|
||||
LocalArchive.prototype.readArticle = function(title, callbackFunction) {
|
||||
var dataFile = null;
|
||||
|
||||
var prefixedFileNumber = "";
|
||||
if (title.fileNr<10) {
|
||||
if (title.fileNr < 10) {
|
||||
prefixedFileNumber = "0" + title.fileNr;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
prefixedFileNumber = title.fileNr;
|
||||
}
|
||||
var expectedFileName = "wikipedia_"+prefixedFileNumber+".dat";
|
||||
var expectedFileName = "wikipedia_" + prefixedFileNumber + ".dat";
|
||||
|
||||
// Find the good dump file
|
||||
for (var i=0; i<this.dataFiles.length; i++) {
|
||||
for ( var i = 0; i < this.dataFiles.length; i++) {
|
||||
var fileName = this.dataFiles[i].name;
|
||||
// Check if the fileName ends with the expected file name (in case of DeviceStorage usage, the fileName is prefixed by the directory)
|
||||
if (fileName.match(expectedFileName+"$") == expectedFileName) {
|
||||
// Check if the fileName ends with the expected file name (in case
|
||||
// of DeviceStorage usage, the fileName is prefixed by the
|
||||
// directory)
|
||||
if (fileName.match(expectedFileName + "$") == expectedFileName) {
|
||||
dataFile = this.dataFiles[i];
|
||||
}
|
||||
}
|
||||
if (!dataFile) {
|
||||
throw "File number " + title.fileNr + " not found";
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
var reader = new FileReader();
|
||||
reader.onerror = errorHandler;
|
||||
reader.onabort = function(e) {
|
||||
alert('Data file read cancelled');
|
||||
};
|
||||
reader.onload = function(e) {
|
||||
var compressedArticles = e.target.result;
|
||||
//var htmlArticle = ArchUtils.bz2.decode(compressedArticles);
|
||||
// TODO : should be improved by uncompressing the content chunk by chunk,
|
||||
// until the length is reached, instead of uncompressing everything
|
||||
var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));
|
||||
// Start reading at offset, and keep length characters
|
||||
var htmlArticle = htmlArticles.substring(title.blockOffset,title.blockOffset + title.articleLength);
|
||||
// Read the article in the dataFile, starting with a chunk of CHUNK_SIZE
|
||||
this.readArticleChunk(title, dataFile, reader, CHUNK_SIZE, callbackFunction);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
* Read a chunk of the dataFile (of the given length) to try to read the
|
||||
* given article.
|
||||
* If the bzip2 algorithm works and articleLength of the article is reached,
|
||||
* call the callbackFunction with the article HTML String.
|
||||
* Else, recursively call this function with readLength + CHUNK_SIZE
|
||||
*
|
||||
* @param title
|
||||
* @param dataFile
|
||||
* @param reader
|
||||
* @param readLength
|
||||
* @param callbackFunction
|
||||
*/
|
||||
LocalArchive.prototype.readArticleChunk = function(title, dataFile, reader,
|
||||
readLength, callbackFunction) {
|
||||
var currentLocalArchiveInstance = this;
|
||||
reader.onerror = errorHandler;
|
||||
reader.onabort = function(e) {
|
||||
alert('Data file read cancelled');
|
||||
};
|
||||
reader.onload = function(e) {
|
||||
var compressedArticles = e.target.result;
|
||||
var htmlArticles;
|
||||
try {
|
||||
htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(
|
||||
compressedArticles)));
|
||||
} catch (e) {
|
||||
// TODO : rethrow exception if we reach the end of the file
|
||||
currentLocalArchiveInstance.readArticleChunk(title, dataFile, reader, readLength + CHUNK_SIZE,
|
||||
callbackFunction);
|
||||
return;
|
||||
}
|
||||
// Start reading at offset, and keep length characters
|
||||
var htmlArticle = htmlArticles.substring(title.blockOffset,
|
||||
title.blockOffset + title.articleLength);
|
||||
if (htmlArticle.length >= title.articleLength) {
|
||||
// Keep only length characters
|
||||
htmlArticle = htmlArticle.substring(0,title.articleLength);
|
||||
htmlArticle = htmlArticle.substring(0, title.articleLength);
|
||||
// Decode UTF-8 encoding
|
||||
htmlArticle = decodeURIComponent(escape(htmlArticle));
|
||||
callbackFunction(htmlArticle);
|
||||
} else {
|
||||
// TODO : throw exception if we reach the end of the file
|
||||
currentLocalArchiveInstance.readArticleChunk(title, dataFile, reader, readLength + CHUNK_SIZE,
|
||||
callbackFunction);
|
||||
}
|
||||
};
|
||||
var blob = dataFile.slice(title.blockStart, title.blockStart
|
||||
+ readLength);
|
||||
|
||||
callbackFunction (htmlArticle);
|
||||
};
|
||||
|
||||
// TODO : should be improved by reading the file chunks by chunks until the article is found,
|
||||
// instead of reading the whole file starting at blockstart
|
||||
var blob = dataFile.slice(title.blockStart);
|
||||
|
||||
// Read in the image file as a binary string.
|
||||
reader.readAsArrayBuffer(blob);
|
||||
}
|
||||
|
||||
// Read in the image file as a binary string.
|
||||
reader.readAsArrayBuffer(blob);
|
||||
};
|
||||
|
||||
/**
|
||||
@ -361,7 +397,7 @@ define(function(require) {
|
||||
callbackFunction(redirectedTitle);
|
||||
};
|
||||
// Read only the 16 necessary bytes, starting at title.blockStart
|
||||
var blob = titleFile.slice(title.blockStart,title.blockStart+16);
|
||||
var blob = this.titleFile.slice(title.blockStart,title.blockStart+16);
|
||||
// Read in the file as a binary string
|
||||
reader.readAsArrayBuffer(blob);
|
||||
};
|
||||
@ -508,4 +544,4 @@ define(function(require) {
|
||||
LocalArchive : LocalArchive,
|
||||
Title : Title
|
||||
};
|
||||
});
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user