From 59ba7bf1cc1a74e0a45cf763c1042a1fcbab83d4 Mon Sep 17 00:00:00 2001 From: mossroy Date: Mon, 20 May 2013 16:16:16 +0200 Subject: [PATCH] Optimization of the algorithm that reads an article, so that it does not read the entire dump for a single article. It was causing crashes on devices (probably because they don't have enough memory), and was very slow, even on a computer. Now, the algorithm reads the file by chunks of 128 KB. Usually, 128 KB seems to be enough to read the entire article --- www/index.html | 8 +-- www/js/app.js | 4 +- www/js/lib/evopedia.js | 116 +++++++++++++++++++++++++++-------------- 3 files changed, 82 insertions(+), 46 deletions(-) diff --git a/www/index.html b/www/index.html index 116b11c3..08716f35 100644 --- a/www/index.html +++ b/www/index.html @@ -48,13 +48,13 @@ License:
To use it, you first have to download a dump locally from http://dumpathome.evopedia.info/dumps/finished (with a BitTorrent client), and then select some of the downloaded files below.
- I have tested it with the small dump (2010-08-14), the French dump (2012-02-03), the French wiktionary dump (2011-03-16) and the English dump (2012-02-11) + I have tested it with the small dump (2010-08-14), the French dump (2013-02-16), the French wiktionary dump (2011-03-16) and the English dump (2012-02-11)


It's only a proof of concept so far: there are many ways this could be enhanced (suggestions and patches are welcome: the source code is on GitHub). In particular: @@ -62,7 +62,7 @@ License:
  • The performance has to be optimized when reading an article
  • Some searches (for example with prefix "a" on the French dump) do not give any result even if they should
  • In some cases, the links inside an article do not work, or do not lead to the right article
  • -
  • On a real device (with small memory) and a normal dump, reading an article crashes because it loads too many things in memory
  • +
  • On a real device, reading an article sometimes crashes because it loads too many things in memory
  • It is hardly usable on a device because the buttons and inputs are too small
  • Following the links in an article does not populate the history of the browser, which prevents the use of the back button
  • diff --git a/www/js/app.js b/www/js/app.js index e5eb58b6..0218b8f3 100644 --- a/www/js/app.js +++ b/www/js/app.js @@ -42,8 +42,8 @@ define(function(require) { } if (storage != null) { - var directory = 'wikipedia_small_2010-08-14'; - //var directory = 'evopedia/wikipedia_fr_2012-02-03'; + //var directory = 'evopedia/wikipedia_small_2010-08-14'; + var directory = 'evopedia/wikipedia_fr_2013-02-16'; localArchive = new evopedia.LocalArchive(); localArchive.readTitleFile(storage, directory); localArchive.readDataFiles(storage, directory, 0); diff --git a/www/js/lib/evopedia.js b/www/js/lib/evopedia.js index 17f8afa4..d505b21a 100644 --- a/www/js/lib/evopedia.js +++ b/www/js/lib/evopedia.js @@ -4,6 +4,9 @@ define(function(require) { var remove_diacritics = require('remove_diacritics'); var bzip2 = require('bzip2'); + // Size of chunks read in the dump files : 128 KB + const CHUNK_SIZE = 131072; + /** * Read an integer encoded in 4 bytes */ @@ -273,64 +276,97 @@ define(function(require) { }); }; - /** - * Read an article from the title instance, and call the callbackFunction with the article HTML String - * @param title - * @param callbackFunction - */ + + /** + * Read an article from the title instance, and call the + * callbackFunction with the article HTML String + * + * @param title + * @param callbackFunction + */ LocalArchive.prototype.readArticle = function(title, callbackFunction) { var dataFile = null; var prefixedFileNumber = ""; - if (title.fileNr<10) { + if (title.fileNr < 10) { prefixedFileNumber = "0" + title.fileNr; - } - else { + } else { prefixedFileNumber = title.fileNr; } - var expectedFileName = "wikipedia_"+prefixedFileNumber+".dat"; + var expectedFileName = "wikipedia_" + prefixedFileNumber + ".dat"; // Find the good dump file - for (var i=0; i= title.articleLength) { // Keep only length characters - htmlArticle = htmlArticle.substring(0,title.articleLength); + htmlArticle = htmlArticle.substring(0, title.articleLength); // Decode UTF-8 
encoding htmlArticle = decodeURIComponent(escape(htmlArticle)); + callbackFunction(htmlArticle); + } else { + // TODO : throw exception if we reach the end of the file + currentLocalArchiveInstance.readArticleChunk(title, dataFile, reader, readLength + CHUNK_SIZE, + callbackFunction); + } + }; + var blob = dataFile.slice(title.blockStart, title.blockStart + + readLength); - callbackFunction (htmlArticle); - }; - - // TODO : should be improved by reading the file chunks by chunks until the article is found, - // instead of reading the whole file starting at blockstart - var blob = dataFile.slice(title.blockStart); - - // Read in the image file as a binary string. - reader.readAsArrayBuffer(blob); - } - + // Read in the image file as a binary string. + reader.readAsArrayBuffer(blob); }; /** @@ -361,7 +397,7 @@ define(function(require) { callbackFunction(redirectedTitle); }; // Read only the 16 necessary bytes, starting at title.blockStart - var blob = titleFile.slice(title.blockStart,title.blockStart+16); + var blob = this.titleFile.slice(title.blockStart,title.blockStart+16); // Read in the file as a binary string reader.readAsArrayBuffer(blob); }; @@ -508,4 +544,4 @@ define(function(require) { LocalArchive : LocalArchive, Title : Title }; -}); \ No newline at end of file +});