Quick and dirty implementation of reading the title list

This commit is contained in:
mossroy 2012-12-09 23:28:40 +01:00
parent 8c59dbe365
commit a2cc7312b4
2 changed files with 114 additions and 20 deletions

View File

@ -2,7 +2,7 @@
<!DOCTYPE html>
<html>
<head>
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
<title>Evopedia HTML5</title>
<script type="text/javascript" src="bzip2-antimatter15.js"></script>
</head>
@ -17,11 +17,16 @@ Blockoffset : <input type="text" id="blockoffset" value="0" />
Length : <input type="text" id="length" value="8866" />
<br/>
<div id="openLocalFiles" style="visibility:hidden">
Please pick the file wikipedia_00.dat from the wikipedia_small_2010-08-14 dump :<br>
<input type="file" id="files" name="file" />
Please pick the file title.idx from the wikipedia_small_2010-08-14 dump :<br/>
<input type="file" id="titleFile"/><br/>
Please pick the file wikipedia_00.dat from the same dump :<br/>
<input type="file" id="dataFile"/>
</div>
<br/>
<input type="button" id="read" value="Read from dump" onclick="readArticleFromHtmlForm(file)" />
<input type="button" id="readTitle" value="Read title list from index" onclick="readAllTitlesFromIndex(titleFile)" />
<input type="button" id="readData" value="Read article from dump" onclick="readArticleFromHtmlForm(dataFile)" />
<br/>
Choose a title : <select id="titleList" onchange="updateOffsetsFromTitle(this.value)"></select>
<br/>
<div id="articleContent">&nbsp;</div>
<hr/>

View File

@ -1,5 +1,6 @@
var file=document.getElementById('files').files[0];
var dataFile=document.getElementById('dataFile').files[0];
var titleFile=document.getElementById('titleFile').files[0];
var storage = navigator.getDeviceStorage('music');
//alert(storage);
@ -7,40 +8,124 @@ var storage = navigator.getDeviceStorage('music');
if (!storage) {
//alert("no device storage available");
document.getElementById('openLocalFiles').style.visibility="visible";
document.getElementById('files').addEventListener('change', handleFileSelect, false);
document.getElementById('files').addEventListener('load', handleFileSelect, false);
document.getElementById('dataFile').addEventListener('change', handleDataFileSelect, false);
document.getElementById('titleFile').addEventListener('change', handleTitleFileSelect, false);
}
else {
var filerequest = storage.get('wikipedia_small_2010-08-14/wikipedia_00.dat');
//alert(filerequest);
filerequest.onsuccess = function() {
file = filerequest.result;
//alert(file);
//readArticleFromHtmlForm(file);
dataFile = filerequest.result;
//alert(dataFile);
filerequest = storage.get('wikipedia_small_2010-08-14/titles.idx');
filerequest.onsuccess = function() {
titleFile = filerequest.result;
//alert(titleFile);
//readArticleFromHtmlForm(file);
};
filerequest.onerror = function() {
alert("error reading title file");
};
};
filerequest.onerror = function() {
alert("error reading file");
alert("error reading data file");
};
}
function readArticleFromHtmlForm(file) {
if (file) {
function updateOffsetsFromTitle(selectValue) {
var offsets=selectValue.split(/\|/);
document.getElementById("blockstart").value=offsets[1];
document.getElementById("blockoffset").value=offsets[2];
document.getElementById("length").value=offsets[3];
}
function readIntegerFrom4Bytes(byteArray,firstIndex) {
return byteArray[firstIndex] + byteArray[firstIndex+1]*256 + byteArray[firstIndex+2]*65536 + byteArray[firstIndex+3]*16777216;
}
function readAllTitlesFromIndex(titleFile) {
if (titleFile) {
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('Title file read cancelled');
};
reader.onload = function(e) {
var binaryTitleFile = e.target.result;
var byteArray = new Uint8Array(binaryTitleFile);
var i = 0;
var titleNumber=0;
var comboTitleList = document.getElementById('titleList');
while (i<byteArray.length) {
var filenumber = 0;
var blockstart = 0;
var blockoffset = 0;
var length = 0;
var title = "";
// TODO : interpret escape area
var escape1 = byteArray[i];
var escape2 = byteArray[i+1];
filenumber = byteArray[i+2];
blockstart = readIntegerFrom4Bytes(byteArray,i+3);
blockoffset = readIntegerFrom4Bytes(byteArray,i+7);
length = readIntegerFrom4Bytes(byteArray,i+11);
var newLineIndex = i+15;
/*
var buf = new ArrayBuffer();
var bufView = new Uint16Array(buf);
var j=0;
while (byteArray[newLineIndex]!=128) {
bufView[j] = byteArray[newLineIndex];
j++
newLineIndex++;
}
title = String.fromCharCode(bufView);
*/
while (newLineIndex<byteArray.length && byteArray[newLineIndex]!=128) {
newLineIndex++;
}
for (var j=i+15;j<newLineIndex;j++) {
title += String.fromCharCode(byteArray[j]);
}
comboTitleList.options[titleNumber] = new Option (title, filenumber+"|"+blockstart+"|"+blockoffset+"|"+length);
titleNumber++;
i=newLineIndex-1;
}
};
var blob = titleFile;
// Read in the image file as a binary string.
reader.readAsArrayBuffer(blob);
}
else {
alert('Title file not set');
}
}
function readArticleFromHtmlForm(dataFile) {
if (dataFile) {
var blockstart = document.getElementById('blockstart').value;
var blockoffset = document.getElementById('blockoffset').value;
var length = document.getElementById('length').value;
readArticleFromOffset(file, blockstart, blockoffset, length);
readArticleFromOffset(dataFile, blockstart, blockoffset, length);
}
else {
alert("File not set");
alert("Data file not set");
}
}
function readArticleFromOffset(file, blockstart, blockoffset, length) {
function readArticleFromOffset(dataFile, blockstart, blockoffset, length) {
var reader = new FileReader();
reader.onerror = errorHandler;
reader.onabort = function(e) {
alert('File read cancelled');
alert('Data file read cancelled');
};
reader.onload = function(e) {
var compressedArticles = e.target.result;
@ -62,7 +147,7 @@ function readArticleFromOffset(file, blockstart, blockoffset, length) {
//var blob = file;
// TODO : should be improved by reading the file chunks by chunks until the article is found,
// instead of reading the whole file starting at blockstart
var blob = file.slice(blockstart);
var blob = dataFile.slice(blockstart);
// Read in the image file as a binary string.
reader.readAsArrayBuffer(blob);
@ -83,6 +168,10 @@ function errorHandler(evt) {
};
}
function handleFileSelect(evt) {
file = evt.target.files[0];
function handleDataFileSelect(evt) {
dataFile = evt.target.files[0];
}
function handleTitleFileSelect(evt) {
titleFile = evt.target.files[0];
}