Fix offset and length handling for reading an article from the dump

This commit is contained in:
mossroy 2012-12-25 13:25:20 +01:00
parent 199e44756c
commit afbfabac85
2 changed files with 28 additions and 12 deletions

View File

@ -10,27 +10,29 @@
<body>
<h1>Evopedia</h1>
<br/>
Blockstart : <input type="text" id="blockstart" value="0" />
<br/>
Blockoffset : <input type="text" id="blockoffset" value="0" />
<br/>
Length : <input type="text" id="length" value="8866" />
<br/>
<div id="openLocalFiles" style="visibility:hidden">
Please pick the file title.idx from the wikipedia_small_2010-08-14 dump :<br/>
Please pick the file titles.idx from the wikipedia_small_2010-08-14 dump :<br/>
<input type="file" id="titleFile"/><br/>
Please pick the file wikipedia_00.dat from the same dump :<br/>
<input type="file" id="dataFile"/>
</div>
<br/>
<input type="button" id="readTitle" value="Read title list from index" onclick="readAllTitlesFromIndex(titleFile)" />
<input type="button" id="readData" value="Read article from dump" onclick="readArticleFromHtmlForm(dataFile)" />
<br/>
Choose a title : <select id="titleList" onchange="updateOffsetsFromTitle(this.value)"></select>
<br/>
Blockstart : <input type="text" id="blockstart" value="0" />
<br/>
Blockoffset : <input type="text" id="blockoffset" value="0" />
<br/>
Length : <input type="text" id="length" value="8866" />
<br/>
<input type="button" id="readData" value="Read article from dump" onclick="readArticleFromHtmlForm(dataFile)" />
<br/>
<div id="articleContent">&nbsp;</div>
<hr/>
<pre id="rawArticleContent">&nbsp;</pre>
<textarea id="rawArticleContent" cols="80" rows="20">&nbsp;</textarea>
<!--<pre id="rawArticleContent">&nbsp;</pre>-->
<script type="text/javascript" src="evopedia.js"></script>
</body>

View File

@ -91,7 +91,16 @@ function readAllTitlesFromIndex(titleFile) {
for (var j=i+15;j<newLineIndex;j++) {
title += String.fromCharCode(byteArray[j]);
}
// TODO : Read the title properly with UTF-8 encoding
/*
var buf = new ArrayBuffer();
var bufView = new Uint16Array(buf);
for (var j=0;j<newLineIndex-i-15;j++) {
bufView[j]=byteArray[j+i+15];
}
title = String.fromCharCode(bufView);
*/
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
titleNumber++;
i=newLineIndex-1;
@ -135,13 +144,18 @@ function readArticleFromOffset(dataFile, blockstart, blockoffset, length) {
// TODO : should be improved by uncompressing the content chunk by chunk,
// until the length is reached, instead of uncompressing everything
var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));
var htmlArticle = htmlArticles.substring(blockoffset,length);
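// Start reading at blockoffset and keep length entries of the uncompressed string (NOTE: this is length, not 2*length, so an article containing multi-byte UTF-8 characters may be cut short if length counts characters - to be confirmed)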
var htmlArticle = htmlArticles.substring(blockoffset,blockoffset+length);
// Keep only length characters
htmlArticle = htmlArticle.substring(0,length);
// Decode UTF-8 encoding
htmlArticle = decodeURIComponent(escape(htmlArticle));
document.getElementById('articleContent').innerHTML = htmlArticle;
// For testing purpose
document.getElementById('rawArticleContent').innerHTML = htmlArticle.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
//document.getElementById('rawArticleContent').innerHTML = htmlArticle.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
//document.getElementById('rawArticleContent').value = decodeURIComponent(escape(htmlArticles));
};
//var blob = file;