Fix offset and length handling for reading an article from the dump

This commit is contained in:
mossroy 2012-12-25 13:25:20 +01:00
parent 199e44756c
commit afbfabac85
2 changed files with 28 additions and 12 deletions

View File

@ -10,27 +10,29 @@
<body> <body>
<h1>Evopedia</h1> <h1>Evopedia</h1>
<br/> <br/>
Blockstart : <input type="text" id="blockstart" value="0" />
<br/>
Blockoffset : <input type="text" id="blockoffset" value="0" />
<br/>
Length : <input type="text" id="length" value="8866" />
<br/>
<div id="openLocalFiles" style="visibility:hidden"> <div id="openLocalFiles" style="visibility:hidden">
Please pick the file title.idx from the wikipedia_small_2010-08-14 dump :<br/> Please pick the file titles.idx from the wikipedia_small_2010-08-14 dump :<br/>
<input type="file" id="titleFile"/><br/> <input type="file" id="titleFile"/><br/>
Please pick the file wikipedia_00.dat from the same dump :<br/> Please pick the file wikipedia_00.dat from the same dump :<br/>
<input type="file" id="dataFile"/> <input type="file" id="dataFile"/>
</div> </div>
<br/> <br/>
<input type="button" id="readTitle" value="Read title list from index" onclick="readAllTitlesFromIndex(titleFile)" /> <input type="button" id="readTitle" value="Read title list from index" onclick="readAllTitlesFromIndex(titleFile)" />
<input type="button" id="readData" value="Read article from dump" onclick="readArticleFromHtmlForm(dataFile)" />
<br/> <br/>
Choose a title : <select id="titleList" onchange="updateOffsetsFromTitle(this.value)"></select> Choose a title : <select id="titleList" onchange="updateOffsetsFromTitle(this.value)"></select>
<br/> <br/>
Blockstart : <input type="text" id="blockstart" value="0" />
<br/>
Blockoffset : <input type="text" id="blockoffset" value="0" />
<br/>
Length : <input type="text" id="length" value="8866" />
<br/>
<input type="button" id="readData" value="Read article from dump" onclick="readArticleFromHtmlForm(dataFile)" />
<br/>
<div id="articleContent">&nbsp;</div> <div id="articleContent">&nbsp;</div>
<hr/> <hr/>
<pre id="rawArticleContent">&nbsp;</pre> <textarea id="rawArticleContent" cols="80" rows="20">&nbsp;</textarea>
<!--<pre id="rawArticleContent">&nbsp;</pre>-->
<script type="text/javascript" src="evopedia.js"></script> <script type="text/javascript" src="evopedia.js"></script>
</body> </body>

View File

@ -91,6 +91,15 @@ function readAllTitlesFromIndex(titleFile) {
for (var j=i+15;j<newLineIndex;j++) { for (var j=i+15;j<newLineIndex;j++) {
title += String.fromCharCode(byteArray[j]); title += String.fromCharCode(byteArray[j]);
} }
// TODO : Read the title properly with UTF-8 encoding
/*
var buf = new ArrayBuffer();
var bufView = new Uint16Array(buf);
for (var j=0;j<newLineIndex-i-15;j++) {
bufView[j]=byteArray[j+i+15];
}
title = String.fromCharCode(bufView);
*/
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length); comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
titleNumber++; titleNumber++;
@ -135,13 +144,18 @@ function readArticleFromOffset(dataFile, blockstart, blockoffset, length) {
// TODO : should be improved by uncompressing the content chunk by chunk, // TODO : should be improved by uncompressing the content chunk by chunk,
// until the length is reached, instead of uncompressing everything // until the length is reached, instead of uncompressing everything
var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles))); var htmlArticles = bzip2.simple(bzip2.array(new Uint8Array(compressedArticles)));
var htmlArticle = htmlArticles.substring(blockoffset,length); // Start reading at offset, and keep 2*length bytes (maximum size in bytes for length characters)
var htmlArticle = htmlArticles.substring(blockoffset,blockoffset+length);
// Keep only length characters
htmlArticle = htmlArticle.substring(0,length);
// Decode UTF-8 encoding // Decode UTF-8 encoding
htmlArticle = decodeURIComponent(escape(htmlArticle)); htmlArticle = decodeURIComponent(escape(htmlArticle));
document.getElementById('articleContent').innerHTML = htmlArticle; document.getElementById('articleContent').innerHTML = htmlArticle;
// For testing purpose // For testing purpose
document.getElementById('rawArticleContent').innerHTML = htmlArticle.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); //document.getElementById('rawArticleContent').innerHTML = htmlArticle.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
//document.getElementById('rawArticleContent').value = decodeURIComponent(escape(htmlArticles));
}; };
//var blob = file; //var blob = file;