+ new trunk zimlib

This commit is contained in:
kelson42 2010-01-08 23:03:06 +00:00
parent 189d97b220
commit 3d44961ead
35 changed files with 1299 additions and 444 deletions

View File

@ -80,7 +80,7 @@ static int accessHandlerCallback(void *cls,
/* Load the article from the ZIM file */
cout << "Loading '" << title << "' in namespace '" << ns << "'" << endl;
try {
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findx(ns[0], zim::QUnicodeString(title));
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findx(ns[0], title);
/* Test if the article was found */
if (resultPair.first == true) {

View File

@ -23,7 +23,6 @@
#include <string>
#include <zim/zim.h>
#include <zim/dirent.h>
#include <zim/qunicode.h>
#include <zim/file.h>
#include <limits>
#include <iosfwd>
@ -50,11 +49,13 @@ namespace zim
std::string getParameter() const { return getDirent().getParameter(); }
QUnicodeString getTitle() const { return getDirent().getTitle(); }
std::string getTitle() const { return getDirent().getTitle(); }
std::string getUrl() const { return getDirent().getUrl(); }
std::string getLongUrl() const { return getDirent().getLongUrl(); }
MimeType getLibraryMimeType() const { return getDirent().getMimeType(); }
uint16_t getLibraryMimeType() const { return getDirent().getMimeType(); }
const std::string&
getMimeType() const;
getMimeType() const { return file.getMimeType(getLibraryMimeType()); }
bool isRedirect() const { return getDirent().isRedirect(); }
@ -67,8 +68,8 @@ namespace zim
bool operator< (const Article& a) const
{ return getNamespace() < a.getNamespace()
|| getNamespace() == a.getNamespace()
&& getTitle() < a.getTitle(); }
|| (getNamespace() == a.getNamespace()
&& getTitle() < a.getTitle()); }
Cluster getCluster() const
{ return file.getCluster(getDirent().getClusterNumber()); }
@ -87,8 +88,6 @@ namespace zim
File& getFile() { return file; }
size_type getIndex() const { return idx; }
QUnicodeString getUrl() const { return getDirent().getUrl(); }
bool good() const { return idx != std::numeric_limits<size_type>::max(); }
};

View File

@ -42,9 +42,9 @@ namespace zim
{ }
Blob(ClusterImpl* cluster, const char* data, unsigned size)
: _cluster(cluster),
_data(data),
_size(size)
: _data(data),
_size(size),
_cluster(cluster)
{ }
const char* data() const { return _data; }

View File

@ -22,14 +22,16 @@
#include <string>
#include <zim/zim.h>
#include <zim/qunicode.h>
#include <limits>
namespace zim
{
class Dirent
{
bool redirect;
MimeType mimeType;
uint16_t mimeType;
size_type version;
size_type clusterNumber; // only used when redirect is false
size_type blobNumber; // only used when redirect is false
@ -37,14 +39,18 @@ namespace zim
size_type redirectIndex; // only used when redirect is true
char ns;
QUnicodeString title;
std::string title;
std::string url;
std::string parameter;
public:
Dirent() {}
bool isRedirect() const { return redirect; }
MimeType getMimeType() const { return mimeType; }
uint16_t getMimeType() const { return mimeType; }
size_type getVersion() const { return version; }
void setVersion(size_type v) { version = v; }
size_type getClusterNumber() const { return isRedirect() ? 0 : clusterNumber; }
size_type getBlobNumber() const { return isRedirect() ? 0 : blobNumber; }
@ -54,26 +60,28 @@ namespace zim
size_type getRedirectIndex() const { return isRedirect() ? redirectIndex : 0; }
char getNamespace() const { return ns; }
const QUnicodeString& getTitle() const { return title; }
const std::string& getTitle() const { return title.empty() ? url : title; }
const std::string& getUrl() const { return url; }
std::string getLongUrl() const;
const std::string& getParameter() const { return parameter; }
uint16_t getExtraLen() const
{
uint16_t s = title.getValue().size();
if (!parameter.empty())
s += (parameter.size() + 1);
return s;
}
unsigned getDirentSize() const
{
return (isRedirect() ? 10 : 14) + getExtraLen();
unsigned ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2;
if (title != url)
ret += title.size();
return ret;
}
void setTitle(char ns_, const QUnicodeString& title_)
void setTitle(const std::string& title_)
{
title = title_;
}
void setUrl(char ns_, const std::string& url_)
{
ns = ns_;
title = title_;
url = url_;
}
void setParameter(const std::string& parameter_)
@ -85,12 +93,12 @@ namespace zim
{
redirect = true;
redirectIndex = idx;
mimeType = zimMimeNone;
mimeType = std::numeric_limits<uint16_t>::max();
clusterNumber = 0;
blobNumber = 0;
}
void setArticle(MimeType mimeType_, size_type clusterNumber_, size_type blobNumber_)
void setArticle(uint16_t mimeType_, size_type clusterNumber_, size_type blobNumber_)
{
redirect = false;
mimeType = mimeType_;
@ -98,7 +106,6 @@ namespace zim
blobNumber = blobNumber_;
}
QUnicodeString getUrl() const;
};
std::ostream& operator<< (std::ostream& out, const Dirent& fh);

View File

@ -45,11 +45,14 @@ namespace zim
const std::string& getFilename() const { return impl->getFilename(); }
const Fileheader& getFileheader() const { return impl->getFileheader(); }
Dirent getDirent(size_type idx);
Dirent getDirent(size_type idx) { return impl->getDirent(idx); }
Dirent getDirentByTitle(size_type idx) { return impl->getDirentByTitle(idx); }
size_type getCountArticles() const { return impl->getCountArticles(); }
Article getArticle(size_type idx) const;
Article getArticle(char ns, const QUnicodeString& title, bool collate = false);
Article getArticle(char ns, const std::string& url);
Article getArticleByTitle(size_type idx);
Article getArticleByTitle(char ns, const std::string& title);
Cluster getCluster(size_type idx) const { return impl->getCluster(idx); }
size_type getCountClusters() const { return impl->getCountClusters(); }
@ -72,12 +75,17 @@ namespace zim
class const_iterator;
const_iterator begin();
const_iterator beginByTitle();
const_iterator end();
std::pair<bool, const_iterator> findx(char ns, const QUnicodeString& title, bool collate = false);
const_iterator find(char ns, const QUnicodeString& title, bool collate = false);
std::pair<bool, const_iterator> findxByTitle(char ns, const std::string& title);
std::pair<bool, const_iterator> findx(char ns, const std::string& url);
const_iterator findByTitle(char ns, const std::string& title);
const_iterator find(char ns, const std::string& url);
bool good() const { return impl.getPointer() != 0; }
time_t getMTime() const { return impl->getMTime(); }
const std::string& getMimeType(uint16_t idx) const { return impl->getMimeType(idx); }
};
}

View File

@ -38,7 +38,9 @@ namespace zim
private:
Uuid uuid;
size_type articleCount;
offset_type indexPtrPos;
offset_type titleIdxPos;
offset_type urlPtrPos;
offset_type mimeListPos;
size_type blobCount;
offset_type blobPtrPos;
size_type mainPage;
@ -47,7 +49,8 @@ namespace zim
public:
Fileheader()
: articleCount(0),
indexPtrPos(0),
titleIdxPos(0),
urlPtrPos(0),
blobCount(0),
blobPtrPos(0),
mainPage(std::numeric_limits<size_type>::max()),
@ -60,22 +63,28 @@ namespace zim
size_type getArticleCount() const { return articleCount; }
void setArticleCount(size_type s) { articleCount = s; }
offset_type getIndexPtrPos() const { return indexPtrPos; }
void setIndexPtrPos(offset_type p) { indexPtrPos = p; }
offset_type getTitleIdxPos() const { return titleIdxPos; }
void setTitleIdxPos(offset_type p) { titleIdxPos = p; }
size_type getClusterCount() const { return blobCount; }
void setClusterCount(size_type s) { blobCount = s; }
offset_type getUrlPtrPos() const { return urlPtrPos; }
void setUrlPtrPos(offset_type p) { urlPtrPos = p; }
offset_type getMimeListPos() const { return mimeListPos; }
void setMimeListPos(offset_type p) { mimeListPos = p; }
size_type getClusterCount() const { return blobCount; }
void setClusterCount(size_type s) { blobCount = s; }
offset_type getClusterPtrPos() const { return blobPtrPos; }
void setClusterPtrPos(offset_type p) { blobPtrPos = p; }
bool hasMainPage() const { return mainPage != std::numeric_limits<size_type>::max(); }
size_type getMainPage() const { return mainPage; }
void setMainPage(size_type s) { mainPage = s; }
bool hasMainPage() const { return mainPage != std::numeric_limits<size_type>::max(); }
size_type getMainPage() const { return mainPage; }
void setMainPage(size_type s) { mainPage = s; }
bool hasLayoutPage() const { return layoutPage != std::numeric_limits<size_type>::max(); }
size_type getLayoutPage() const { return layoutPage; }
void setLayoutPage(size_type s) { layoutPage = s; }
bool hasLayoutPage() const { return layoutPage != std::numeric_limits<size_type>::max(); }
size_type getLayoutPage() const { return layoutPage; }
void setLayoutPage(size_type s) { layoutPage = s; }
};
std::ostream& operator<< (std::ostream& out, const Fileheader& fh);

View File

@ -26,7 +26,6 @@
#include <map>
#include <zim/refcounted.h>
#include <zim/zim.h>
#include <zim/qunicode.h>
#include <zim/fileheader.h>
#include <zim/cache.h>
#include <zim/dirent.h>
@ -40,10 +39,6 @@ namespace zim
Fileheader header;
std::string filename;
typedef std::vector<offset_type> OffsetsType;
OffsetsType indexOffsets;
OffsetsType clusterOffsets;
Cache<size_type, Dirent> direntCache;
Cache<offset_type, Cluster> clusterCache;
typedef std::map<char, size_type> NamespaceCache;
@ -53,6 +48,11 @@ namespace zim
std::string namespaces;
time_t mtime;
typedef std::vector<std::string> MimeTypes;
MimeTypes mimeTypes;
offset_type getOffset(offset_type ptrOffset, size_type idx);
public:
explicit FileImpl(const char* fname);
@ -62,11 +62,13 @@ namespace zim
const Fileheader& getFileheader() const { return header; }
Dirent getDirent(size_type idx);
size_type getCountArticles() const { return indexOffsets.size(); }
Dirent getDirentByTitle(size_type idx);
size_type getIndexByTitle(size_type idx);
size_type getCountArticles() const { return header.getArticleCount(); }
Cluster getCluster(size_type idx);
size_type getCountClusters() const { return clusterOffsets.size(); }
offset_type getClusterOffset(size_type idx) const { return clusterOffsets[idx]; }
size_type getCountClusters() const { return header.getClusterCount(); }
offset_type getClusterOffset(size_type idx) { return getOffset(header.getClusterPtrPos(), idx); }
size_type getNamespaceBeginOffset(char ch);
size_type getNamespaceEndOffset(char ch);
@ -76,6 +78,7 @@ namespace zim
std::string getNamespaces();
bool hasNamespace(char ch);
const std::string& getMimeType(uint16_t idx) const;
};
}

View File

@ -27,30 +27,40 @@ namespace zim
{
class File::const_iterator : public std::iterator<std::bidirectional_iterator_tag, Article>
{
public:
enum Mode {
UrlIterator,
ArticleIterator
};
private:
File* file;
size_type idx;
mutable Article article;
Mode mode;
bool is_end() const { return file == 0 || idx >= file->getCountArticles(); }
public:
explicit const_iterator(File* file_ = 0, size_type idx_ = 0)
explicit const_iterator(File* file_ = 0, size_type idx_ = 0, Mode mode_ = UrlIterator)
: file(file_),
idx(idx_)
idx(idx_),
mode(mode_)
{ }
size_type getIndex() const { return idx; }
const File& getFile() const { return *file; }
bool operator== (const const_iterator& it) const
{ return is_end() && it.is_end()
|| file == it.file && idx == it.idx; }
{ return (is_end() && it.is_end())
|| (file == it.file && idx == it.idx); }
bool operator!= (const const_iterator& it) const
{ return !operator==(it); }
const_iterator& operator++()
{
++idx;
article = Article();
return *this;
}
@ -64,6 +74,7 @@ namespace zim
const_iterator& operator--()
{
--idx;
article = Article();
return *this;
}
@ -74,17 +85,17 @@ namespace zim
return *this;
}
Article operator*() const
const Article& operator*() const
{
if (article.getIndex() != idx)
article = file->getArticle(idx);
if (!article.good())
article = mode == UrlIterator ? file->getArticle(idx)
: file->getArticleByTitle(idx);
return article;
}
pointer operator->() const
{
if (article.getIndex() != idx)
article = file->getArticle(idx);
operator*();
return &article;
}

View File

@ -0,0 +1,94 @@
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef ZIM_LZMASTREAM_H
#define ZIM_LZMASTREAM_H
#include <iostream>
#include <stdexcept>
#include <lzma.h>
#include <vector>
namespace zim
{
class LzmaError : public std::runtime_error
{
lzma_ret ret;
public:
LzmaError(lzma_ret ret_, const std::string& msg)
: std::runtime_error(msg),
ret(ret_)
{ }
lzma_ret getRetcode() const { return ret; }
};
/// std::streambuf subclass that holds an lzma_stream, a character buffer
/// and a pointer to a sink streambuf. Only setSink() is defined in this
/// header; every other member is implemented elsewhere.
class LzmaStreamBuf : public std::streambuf
{
    lzma_stream stream;
    std::vector<char_type> obuffer;
    /// streambuf registered as sink; not owned as far as this header shows
    std::streambuf* sink;

  public:
    /// @param sink_    initial sink streambuf
    /// @param preset   defaults to level 3 combined with LZMA_PRESET_EXTREME
    /// @param check    defaults to LZMA_CHECK_CRC32
    /// @param bufsize  defaults to 8192 (presumably the size of obuffer --
    ///                 confirm against the implementation)
    LzmaStreamBuf(std::streambuf* sink_,
                  uint32_t preset = 3 | LZMA_PRESET_EXTREME,
                  lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */,
                  unsigned bufsize = 8192);

    ~LzmaStreamBuf();

    /// see std::streambuf
    int_type overflow(int_type c);

    /// see std::streambuf
    int_type underflow();

    /// see std::streambuf
    int sync();

    /// end stream
    int end();

    /// Replaces the stored sink pointer with the given streambuf.
    void setSink(std::streambuf* newSink)
    {
      sink = newSink;
    }
};
/// std::ostream whose stream buffer is an embedded LzmaStreamBuf.
class LzmaStream : public std::ostream
{
    LzmaStreamBuf streambuf;

  public:
    /// Constructs the stream on top of a raw sink streambuf.
    /// The base class starts with a null buffer and is re-initialised with
    /// the embedded LzmaStreamBuf once that member has been constructed.
    explicit LzmaStream(std::streambuf* sink,
                        uint32_t preset = 3 | LZMA_PRESET_EXTREME,
                        lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */,
                        unsigned bufsize = 8192)
      : std::ostream(0),
        streambuf(sink, preset, check, bufsize)
    {
      init(&streambuf);
    }

    /// Same as above, but takes the sink's buffer from an std::ostream.
    explicit LzmaStream(std::ostream& sink,
                        uint32_t preset = 3 | LZMA_PRESET_EXTREME,
                        lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */,
                        unsigned bufsize = 8192)
      : std::ostream(0),
        streambuf(sink.rdbuf(), preset, check, bufsize)
    {
      init(&streambuf);
    }

    /// Defined outside this header.
    void end();

    /// Forwards the new sink streambuf to the embedded LzmaStreamBuf.
    void setSink(std::streambuf* sink)
    {
      streambuf.setSink(sink);
    }

    /// Forwards the rdbuf() of the given ostream as the new sink.
    void setSink(std::ostream& sink)
    {
      std::streambuf* target = sink.rdbuf();
      streambuf.setSink(target);
    }
};
}
#endif // ZIM_LZMASTREAM_H

View File

@ -93,8 +93,8 @@ namespace zim
{ }
void search(Results& results, const std::string& expr);
void find(Results& results, char ns, const QUnicodeString& praefix, unsigned limit = searchLimit);
void find(Results& results, char ns, const QUnicodeString& begin, const QUnicodeString& end, unsigned limit = searchLimit);
void find(Results& results, char ns, const std::string& praefix, unsigned limit = searchLimit);
void find(Results& results, char ns, const std::string& begin, const std::string& end, unsigned limit = searchLimit);
static double getWeightOcc() { return weightOcc; }
static double getWeightOccOff() { return weightOccOff; }

View File

@ -31,25 +31,6 @@
namespace zim
{
template <typename objectType>
class InternalRefCounted
{
protected:
bool unlink(objectType* object)
{
if (object)
object->release();
return false;
}
void link(const InternalRefCounted& ptr, objectType* object)
{
if (object)
object->addRef();
}
};
template <typename objectType>
class SmartPtr
{

View File

@ -32,7 +32,7 @@ namespace zim
public:
virtual void onData(const std::string& data) = 0;
virtual void onToken(const std::string& token) = 0;
virtual void onLink(char ns, const std::string& title) = 0;
virtual void onLink(char ns, const std::string& url) = 0;
};
private:

View File

@ -0,0 +1,91 @@
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef ZIM_UNLZMASTREAM_H
#define ZIM_UNLZMASTREAM_H
#include <iostream>
#include <stdexcept>
#include <lzma.h>
namespace zim
{
class UnlzmaError : public std::runtime_error
{
lzma_ret ret;
public:
UnlzmaError(lzma_ret ret_, const std::string& msg)
: std::runtime_error(msg),
ret(ret_)
{ }
lzma_ret getRetcode() const { return ret; }
};
/// std::streambuf subclass holding an lzma_stream, one raw character
/// buffer and a pointer to a streambuf used as sink and/or source.
/// The private helpers view the single buffer as two regions of
/// bufsize / 2 characters each: the first is the "ibuffer", the second
/// the "obuffer".
class UnlzmaStreamBuf : public std::streambuf
{
    lzma_stream stream;
    char_type* iobuffer;
    unsigned bufsize;
    std::streambuf* sinksource;

    /// start of the first region
    char_type* ibuffer()
    {
      return iobuffer;
    }

    /// length of the first region (half of bufsize, rounded down)
    std::streamsize ibuffer_size()
    {
      return bufsize / 2;
    }

    /// start of the second region, directly behind the first one
    char_type* obuffer()
    {
      return iobuffer + ibuffer_size();
    }

    /// length of the second region (half of bufsize, rounded down)
    std::streamsize obuffer_size()
    {
      return bufsize / 2;
    }

  public:
    /// @param sinksource_  streambuf to read from / write to
    /// @param bufsize      total buffer size, defaults to 8192
    ///                     (allocation happens in the implementation file
    ///                     -- not visible here)
    explicit UnlzmaStreamBuf(std::streambuf* sinksource_, unsigned bufsize = 8192);

    ~UnlzmaStreamBuf();

    /// see std::streambuf
    int_type overflow(int_type c);

    /// see std::streambuf
    int_type underflow();

    /// see std::streambuf
    int sync();

    /// Replaces the stored sink/source pointer with the given streambuf.
    void setSinksource(std::streambuf* newSinksource)
    {
      sinksource = newSinksource;
    }
};
/// std::iostream whose stream buffer is an embedded UnlzmaStreamBuf.
class UnlzmaStream : public std::iostream
{
    UnlzmaStreamBuf streambuf;

  public:
    /// Constructs the stream on top of a raw streambuf.
    /// The base class starts with a null buffer and is re-initialised with
    /// the embedded UnlzmaStreamBuf once that member has been constructed.
    explicit UnlzmaStream(std::streambuf* sinksource, unsigned bufsize = 8192)
      : std::iostream(0),
        streambuf(sinksource, bufsize)
    {
      init(&streambuf);
    }

    /// Same as above, but takes the buffer from any std::ios object.
    explicit UnlzmaStream(std::ios& sinksource, unsigned bufsize = 8192)
      : std::iostream(0),
        streambuf(sinksource.rdbuf(), bufsize)
    {
      init(&streambuf);
    }

    /// Forwards a new sink/source streambuf to the embedded buffer.
    void setSinksource(std::streambuf* sinksource)
    {
      streambuf.setSinksource(sinksource);
    }

    /// Forwards the rdbuf() of the given std::ios object.
    void setSinksource(std::ios& sinksource)
    {
      std::streambuf* target = sinksource.rdbuf();
      streambuf.setSinksource(target);
    }

    /// Uses the rdbuf() of an output stream as sink/source.
    void setSink(std::ostream& sink)
    {
      std::streambuf* target = sink.rdbuf();
      streambuf.setSinksource(target);
    }

    /// Uses the rdbuf() of an input stream as sink/source.
    void setSource(std::istream& source)
    {
      std::streambuf* target = source.rdbuf();
      streambuf.setSinksource(target);
    }
};
}
#endif // ZIM_UNLZMASTREAM_H

View File

@ -42,23 +42,7 @@ namespace zim
zimcompLzma
};
enum MimeType
{
zimMimeNone = -1,
zimMimeTextHtml,
zimMimeTextPlain,
zimMimeImageJpeg,
zimMimeImagePng,
zimMimeImageTiff,
zimMimeTextCss,
zimMimeImageGif,
zimMimeIndex,
zimMimeApplicationJavaScript,
zimMimeImageIcon,
zimMimeTextXml,
zimMimeTextHtmlTemplate
};
static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate";
}
#endif // ZIM_ZIM_H

View File

@ -24,32 +24,74 @@
#include <iostream>
#include <zim/zim.h>
/*
ZInt implements an int compressor and decompressor. The algorithm compresses
small values into fewer bytes.
The idea is to add information about used bytes in the first byte. The number
of additional bytes used is specified by the number of set bits counted from
the most significant bit. So the numbers 0-127 are encoded as is, since they
fit into the 7 low order bits and the high order bit specifies, that no
additional bytes are used. The number starting from 128 up to 16383 need more
than 7 bits, so we need to set the highest order bit to 1 and the next bit to
0, leaving 6 bits of actual data, which is used as the low order bits of the
number.
Since the numbers 0-127 are already encoded in one byte, 128 is
subtracted from the actual number, so a 2 byte zero is actually a 128.
The same logic continues on the 3rd, 4th, ... byte. Up to 7 additional bytes
are used, so the first byte must contain at least one 0.
binary range
------------------------------- --------------------------------------------------
0xxx xxxx 0 - 127
10xx xxxx xxxx xxxx 128 - (2^14+128-1 = 16511)
110x xxxx xxxx xxxx xxxx xxxx 16512 - (2^21+16512-1 = 2113663)
1110 xxxx xxxx xxxx xxxx xxxx xxxx xxxx
2113664 - (2^28+2113664-1 = 270549119)
...
*/
namespace zim
{
class IZIntStream
class ZIntStream
{
std::istream& stream;
std::istream* _istream;
std::ostream* _ostream;
public:
explicit IZIntStream(std::istream& stream_)
: stream(stream_)
{ }
/// prepare ZIntStream for compression or decompression
explicit ZIntStream(std::iostream& iostream)
: _istream(&iostream),
_ostream(&iostream)
{ }
IZIntStream& get(size_type &value);
operator void*() const { return stream; }
};
/// prepare ZIntStream for decompression
explicit ZIntStream(std::istream& istream)
: _istream(&istream),
_ostream(0)
{ }
class OZIntStream
{
std::ostream& stream;
/// prepare ZIntStream for compression
explicit ZIntStream(std::ostream& ostream)
: _istream(0),
_ostream(&ostream)
{ }
public:
explicit OZIntStream(std::ostream& stream_)
: stream(stream_)
{ }
/// decompresses one value from input stream and returns it
size_type get();
OZIntStream& put(size_type value);
operator void*() const { return stream; }
ZIntStream& get(size_type &value)
{ value = get(); return *this; }
/// compresses one value to output stream
ZIntStream& put(size_type value);
operator bool() const
{ return (_istream == 0 || *_istream)
&& (_ostream == 0 || *_ostream); }
};
}

View File

@ -28,50 +28,6 @@ log_define("zim.article")
namespace zim
{
const std::string& Article::getMimeType() const
{
static const std::string textHtml = "text/html; charset=UTF-8";
static const std::string textPlain = "text/plain";
static const std::string textXml = "application/xml";
static const std::string imageJpeg = "image/jpeg";
static const std::string imagePng = "image/png";
static const std::string imageTiff = "image/tiff";
static const std::string textCss = "text/css";
static const std::string imageGif = "image/gif";
static const std::string index = "text/plain";
static const std::string applicationJavaScript = "application/x-javascript";
static const std::string imageIcon = "image/x-icon";
switch (getLibraryMimeType())
{
case zimMimeTextHtml:
case zimMimeTextHtmlTemplate:
return textHtml;
case zimMimeTextPlain:
return textPlain;
case zimMimeImageJpeg:
return imageJpeg;
case zimMimeImagePng:
return imagePng;
case zimMimeImageTiff:
return imageTiff;
case zimMimeTextCss:
return textCss;
case zimMimeImageGif:
return imageGif;
case zimMimeIndex:
return index;
case zimMimeApplicationJavaScript:
return applicationJavaScript;
case zimMimeImageIcon:
return imageIcon;
case zimMimeTextXml:
return textXml;
}
return textHtml;
}
size_type Article::getArticleSize() const
{
Dirent dirent = getDirent();
@ -108,9 +64,9 @@ namespace zim
log_trace("onToken(\"" << token << "\")");
if (token == "title")
out << article.getTitle().toUtf8();
out << article.getTitle();
else if (token == "url")
out << article.getUrl().toUtf8();
out << article.getUrl();
else if (token == "namespace")
out << article.getNamespace();
else if (token == "content")
@ -126,11 +82,11 @@ namespace zim
}
}
void Ev::onLink(char ns, const std::string& title)
void Ev::onLink(char ns, const std::string& url)
{
if (maxRecurse <= 0)
throw std::runtime_error("maximum recursive limit is reached");
article.getFile().getArticle(ns, QUnicodeString::fromUtf8(title)).getPage(out, false, maxRecurse - 1);
article.getFile().getArticle(ns, url).getPage(out, false, maxRecurse - 1);
}
}
@ -146,7 +102,7 @@ namespace zim
{
log_trace("Article::getPage(" << layout << ", " << maxRecurse << ')');
if (getLibraryMimeType() == zimMimeTextHtml || getLibraryMimeType() == zimMimeTextHtmlTemplate)
if (getMimeType().compare(0, 9, "text/html") == 0 || getMimeType() == MimeHtmlTemplate)
{
if (layout && file.getFileheader().hasLayoutPage())
{
@ -162,7 +118,7 @@ namespace zim
return;
}
else if (getLibraryMimeType() == zimMimeTextHtmlTemplate)
else if (getMimeType() == MimeHtmlTemplate)
{
Blob data = getData();

View File

@ -43,7 +43,7 @@ namespace zim
for (File::const_iterator it = articleFile.begin(); it != articleFile.end(); ++it)
{
std::string title = it->getTitle().toUtf8();
std::string title = it->getTitle();
if (title.find(expr) != std::string::npos)
ret.push_back(*it);
}

View File

@ -66,8 +66,6 @@ namespace zim
Bunzip2StreamBuf::int_type Bunzip2StreamBuf::overflow(int_type c)
{
log_debug("Bunzip2StreamBuf::overflow");
if (pptr())
{
// initialize input-stream for
@ -81,10 +79,8 @@ namespace zim
stream.next_out = ibuffer();
stream.avail_out = ibuffer_size();
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
ret = ::BZ2_bzDecompress(&stream);
checkError(ret, stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret);
// copy ibuffer to sinksource
std::streamsize count = ibuffer_size() - stream.avail_out;
@ -118,14 +114,12 @@ namespace zim
{
// there is data already available
// read compressed data from source into ibuffer
log_debug("in_avail=" << sinksource->in_avail());
stream.avail_in = sinksource->sgetn(ibuffer(), mymin(sinksource->in_avail(), ibuffer_size()));
}
else
{
// no data available
stream.avail_in = sinksource->sgetn(ibuffer(), ibuffer_size());
log_debug(stream.avail_in << " bytes read from source");
if (stream.avail_in == 0)
return traits_type::eof();
}
@ -137,9 +131,7 @@ namespace zim
// at least one character received from source - pass to decompressor
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
int ret = ::BZ2_bzDecompress(&stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret);
checkError(ret, stream);

View File

@ -56,14 +56,11 @@ namespace zim
Bzip2StreamBuf::~Bzip2StreamBuf()
{
log_debug("bzCompressEnd");
::BZ2_bzCompressEnd(&stream);
}
Bzip2StreamBuf::int_type Bzip2StreamBuf::overflow(int_type c)
{
log_debug("Bzip2StreamBuf::overflow");
// initialize input-stream
stream.next_in = &obuffer[0];
stream.avail_in = pptr() - &obuffer[0];
@ -74,9 +71,7 @@ namespace zim
stream.avail_out = sizeof(zbuffer);
// deflate
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " BZ_RUN");
int ret = checkError(::BZ2_bzCompress(&stream, BZ_RUN), stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret << " total_out_lo32=" << stream.total_out_lo32);
checkError(::BZ2_bzCompress(&stream, BZ_RUN), stream);
// copy zbuffer to sink / consume deflated data
std::streamsize count = sizeof(zbuffer) - stream.avail_out;
@ -106,8 +101,6 @@ namespace zim
int Bzip2StreamBuf::sync()
{
log_debug("Bzip2StreamBuf::sync");
// initialize input-stream for
stream.next_in = &obuffer[0];
stream.avail_in = pptr() - pbase();
@ -119,9 +112,7 @@ namespace zim
stream.next_out = zbuffer;
stream.avail_out = sizeof(zbuffer);
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " BZ_FLUSH");
ret = checkError(::BZ2_bzCompress(&stream, BZ_FLUSH), stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret << " total_out_lo32=" << stream.total_out_lo32);
// copy zbuffer to sink
std::streamsize count = sizeof(zbuffer) - stream.avail_out;
@ -141,8 +132,6 @@ namespace zim
int Bzip2StreamBuf::end()
{
log_debug("Bzip2StreamBuf::end");
char zbuffer[8192];
// initialize input-stream for
stream.next_in = &obuffer[0];
@ -154,9 +143,7 @@ namespace zim
stream.next_out = zbuffer;
stream.avail_out = sizeof(zbuffer);
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " BZ_FINISH");
ret = checkError(::BZ2_bzCompress(&stream, BZ_FINISH), stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret << " total_out_lo32=" << stream.total_out_lo32);
// copy zbuffer to sink
std::streamsize count = sizeof(zbuffer) - stream.avail_out;

View File

@ -19,16 +19,33 @@
#include <zim/cluster.h>
#include <zim/blob.h>
#include <zim/endian.h>
#include <stdlib.h>
#include <sstream>
#include "log.h"
#include "config.h"
#ifdef ENABLE_ZLIB
#include <zim/deflatestream.h>
#include <zim/inflatestream.h>
#endif
#ifdef ENABLE_BZIP2
#include <zim/bzip2stream.h>
#include <zim/bunzip2stream.h>
#include <zim/endian.h>
#endif
#ifdef ENABLE_LZMA
#include <zim/lzmastream.h>
#include <zim/unlzmastream.h>
#endif
log_define("zim.cluster")
#define log_debug1(e)
namespace zim
{
Cluster::Cluster()
@ -50,7 +67,7 @@ namespace zim
void ClusterImpl::read(std::istream& in)
{
log_debug("read");
log_debug1("read");
// read first offset, which specifies, how many offsets we need to read
size_type offset;
@ -63,7 +80,7 @@ namespace zim
size_type n = offset / 4;
size_type a = offset;
log_debug("first offset is " << offset << " n=" << n << " a=" << a);
log_debug1("first offset is " << offset << " n=" << n << " a=" << a);
// read offsets
offsets.clear();
@ -75,11 +92,11 @@ namespace zim
in.read(reinterpret_cast<char*>(&offset), sizeof(offset));
if (in.fail())
{
log_debug("fail at " << n);
log_debug1("fail at " << n);
return;
}
offset = fromLittleEndian(&offset);
log_debug("offset=" << offset << '(' << offset-a << ')');
log_debug1("offset=" << offset << '(' << offset-a << ')');
offsets.push_back(offset - a);
}
@ -88,7 +105,7 @@ namespace zim
{
n = offsets.back() - offsets.front();
data.resize(n);
log_debug("read " << n << " bytes of data");
log_debug1("read " << n << " bytes of data");
in.read(&(data[0]), n);
}
}
@ -109,12 +126,9 @@ namespace zim
void ClusterImpl::addBlob(const Blob& blob)
{
log_debug("addBlob(ptr, " << blob.size() << ')');
log_debug1("addBlob(ptr, " << blob.size() << ')');
data.insert(data.end(), blob.data(), blob.end());
offsets.push_back(data.size());
for (unsigned n = 0; n < offsets.size(); ++n)
log_debug("offset[" << n << "]=" << offsets[n]);
}
Blob ClusterImpl::getBlob(size_type n) const
@ -141,6 +155,8 @@ namespace zim
std::istream& operator>> (std::istream& in, ClusterImpl& clusterImpl)
{
log_trace("read cluster");
char c;
in.get(c);
clusterImpl.setCompression(static_cast<CompressionType>(c));
@ -154,22 +170,42 @@ namespace zim
case zimcompZip:
{
#ifdef ENABLE_ZLIB
log_debug("uncompress data (zlib)");
zim::InflateStream is(in);
is.exceptions(std::ios::failbit | std::ios::badbit);
clusterImpl.read(is);
#else
throw std::runtime_error("zlib not enabled in this library");
#endif
break;
}
case zimcompBzip2:
{
#ifdef ENABLE_BZIP2
log_debug("uncompress data (bzip2)");
zim::Bunzip2Stream is(in);
is.exceptions(std::ios::failbit | std::ios::badbit);
clusterImpl.read(is);
#else
throw std::runtime_error("bzip2 not enabled in this library");
#endif
break;
}
case zimcompLzma:
throw std::runtime_error("lzma decompression is not implemented");
{
#ifdef ENABLE_LZMA
log_debug("uncompress data (lzma)");
zim::UnlzmaStream is(in);
is.exceptions(std::ios::failbit | std::ios::badbit);
clusterImpl.read(is);
#else
throw std::runtime_error("lzma not enabled in this library");
#endif
break;
}
default:
log_error("invalid compression flag " << c);
@ -187,6 +223,8 @@ namespace zim
std::ostream& operator<< (std::ostream& out, const ClusterImpl& clusterImpl)
{
log_trace("write cluster");
out.put(static_cast<char>(clusterImpl.getCompression()));
switch(clusterImpl.getCompression())
@ -198,24 +236,65 @@ namespace zim
case zimcompZip:
{
#ifdef ENABLE_ZLIB
log_debug("compress data (zlib)");
zim::DeflateStream os(out);
os.exceptions(std::ios::failbit | std::ios::badbit);
clusterImpl.write(os);
os.flush();
#else
throw std::runtime_error("zlib not enabled in this library");
#endif
break;
}
case zimcompBzip2:
{
#ifdef ENABLE_BZIP2
log_debug("compress data (bzip2)");
zim::Bzip2Stream os(out);
os.exceptions(std::ios::failbit | std::ios::badbit);
clusterImpl.write(os);
os.end();
#else
throw std::runtime_error("bzip2 not enabled in this library");
#endif
break;
}
case zimcompLzma:
throw std::runtime_error("lzma compression is not implemented");
{
#ifdef ENABLE_LZMA
uint32_t lzmaPreset = 3 | LZMA_PRESET_EXTREME;
/**
* read lzma preset from environment
* ZIM_LZMA_LEVEL is a number followed optionally by a
* suffix 'e'. The number gives the preset and the suffix tells,
* if LZMA_PRESET_EXTREME should be set.
* e.g.:
* ZIM_LZMA_LEVEL=9 => 9
* ZIM_LZMA_LEVEL=3e => 3 + extreme
*/
const char* e = ::getenv("ZIM_LZMA_LEVEL");
if (e)
{
char flag = '\0';
std::istringstream s(e);
s >> lzmaPreset >> flag;
if (flag == 'e')
lzmaPreset |= LZMA_PRESET_EXTREME;
}
log_debug("compress data (lzma, " << std::hex << lzmaPreset << ")");
zim::LzmaStream os(out, lzmaPreset);
os.exceptions(std::ios::failbit | std::ios::badbit);
clusterImpl.write(os);
os.end();
#else
throw std::runtime_error("lzma not enabled in this library");
#endif
break;
}
default:
std::ostringstream msg;

99
src/zimlib/src/config.h Normal file
View File

@ -0,0 +1,99 @@
/* src/zimlib/src/config.h. Generated from config.h.in by configure. */
/* src/zimlib/src/config.h.in. Generated from configure.ac by autoheader. */
/* set zim cluster cache size to number of cached chunks */
#define CLUSTER_CACHE_SIZE 16
/* set zim dirent cache size to number of cached chunks */
#define DIRENT_CACHE_SIZE 51200
/* defined if bzip2 compression is enabled */
#define ENABLE_BZIP2 1
/* defined if lzma compression is enabled */
#define ENABLE_LZMA 1
/* defined if zlib compression is enabled */
#define ENABLE_ZLIB 1
/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* Define to 1 if you have the `bz2' library (-lbz2). */
#define HAVE_LIBBZ2 1
/* Define to 1 if you have the `lzma' library (-llzma). */
#define HAVE_LIBLZMA 1
/* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */
#define HAVE_LIBMICROHTTPD 1
/* Define to 1 if you have the `unac' library (-lunac). */
#define HAVE_LIBUNAC 1
/* Define to 1 if you have the `z' library (-lz). */
#define HAVE_LIBZ 1
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
/* Define to 1 if you have the `stat64' function. */
#define HAVE_STAT64 1
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1
/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1
/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#define LT_OBJDIR ".libs/"
/* set lzma uncompress memory size to number of MB */
#define LZMA_MEMORY_SIZE 128
/* Name of package */
#define PACKAGE "kiwix"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT ""
/* Define to the full name of this package. */
#define PACKAGE_NAME "kiwix"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "kiwix 0.9"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "kiwix"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "0.9"
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
/* Version number of package */
#define VERSION "0.9"

View File

@ -6,6 +6,15 @@
/* set zim dirent cache size to number of cached chunks */
#undef DIRENT_CACHE_SIZE
/* defined if bzip2 compression is enabled */
#undef ENABLE_BZIP2
/* defined if lzma compression is enabled */
#undef ENABLE_LZMA
/* defined if zlib compression is enabled */
#undef ENABLE_ZLIB
/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H
@ -15,6 +24,9 @@
/* Define to 1 if you have the `bz2' library (-lbz2). */
#undef HAVE_LIBBZ2
/* Define to 1 if you have the `lzma' library (-llzma). */
#undef HAVE_LIBLZMA
/* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */
#undef HAVE_LIBMICROHTTPD
@ -55,6 +67,9 @@
*/
#undef LT_OBJDIR
/* set lzma uncompress memory size to number of MB */
#undef LZMA_MEMORY_SIZE
/* Name of package */
#undef PACKAGE

View File

@ -70,8 +70,6 @@ namespace zim
DeflateStreamBuf::int_type DeflateStreamBuf::overflow(int_type c)
{
log_debug("DeflateStreamBuf::overflow");
// initialize input-stream
stream.next_in = reinterpret_cast<Bytef*>(&obuffer[0]);
stream.avail_in = pptr() - &obuffer[0];
@ -82,9 +80,7 @@ namespace zim
stream.avail_out = sizeof(zbuffer);
// deflate
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
checkError(::deflate(&stream, Z_NO_FLUSH), stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
// copy zbuffer to sink / consume deflated data
std::streamsize count = sizeof(zbuffer) - stream.avail_out;
@ -114,8 +110,6 @@ namespace zim
int DeflateStreamBuf::sync()
{
log_debug("DeflateStreamBuf::sync");
// initialize input-stream for
stream.next_in = reinterpret_cast<Bytef*>(&obuffer[0]);
stream.avail_in = pptr() - pbase();
@ -126,9 +120,7 @@ namespace zim
stream.next_out = (Bytef*)zbuffer;
stream.avail_out = sizeof(zbuffer);
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
checkError(::deflate(&stream, Z_SYNC_FLUSH), stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
// copy zbuffer to sink
std::streamsize count = sizeof(zbuffer) - stream.avail_out;
@ -157,9 +149,7 @@ namespace zim
stream.next_out = (Bytef*)zbuffer;
stream.avail_out = sizeof(zbuffer);
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
int ret = checkError(::deflate(&stream, Z_FINISH), stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
// copy zbuffer to sink
std::streamsize count = sizeof(zbuffer) - stream.avail_out;

View File

@ -35,33 +35,35 @@ namespace zim
{
union
{
char d[12];
char d[16];
long a;
} header;
header.d[0] = static_cast<char>(dirent.isRedirect());
header.d[1] = static_cast<char>(dirent.getMimeType());
header.d[2] = '\0';
toLittleEndian(dirent.getMimeType(), header.d);
header.d[2] = static_cast<char>(dirent.getParameter().size());
header.d[3] = dirent.getNamespace();
log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().getValue().size() << " extralen=" << dirent.getExtraLen());
log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size());
toLittleEndian(dirent.getVersion(), header.d + 4);
if (dirent.isRedirect())
{
toLittleEndian(dirent.getRedirectIndex(), header.d + 4);
toLittleEndian(dirent.getExtraLen(), header.d + 8);
out.write(header.d, 10);
toLittleEndian(dirent.getRedirectIndex(), header.d + 8);
out.write(header.d, 12);
}
else
{
toLittleEndian(dirent.getClusterNumber(), header.d + 4);
toLittleEndian(dirent.getBlobNumber(), header.d + 8);
toLittleEndian(dirent.getExtraLen(), header.d + 12);
out.write(header.d, 14);
toLittleEndian(dirent.getClusterNumber(), header.d + 8);
toLittleEndian(dirent.getBlobNumber(), header.d + 12);
out.write(header.d, 16);
}
out << dirent.getTitle().getValue();
if (!dirent.getParameter().empty())
out << '\0' << dirent.getParameter();
out << dirent.getUrl() << '\0';
std::string t = dirent.getTitle();
if (t != dirent.getUrl())
out << t;
out << '\0' << dirent.getParameter();
return out;
}
@ -71,34 +73,34 @@ namespace zim
union
{
long a;
char d[14];
char d[16];
} header;
in.read(header.d, 10);
in.read(header.d, 12);
if (in.fail())
{
log_warn("error reading dirent header");
return in;
}
if (in.gcount() != 10)
if (in.gcount() != 12)
{
log_warn("error reading dirent header (2)");
in.setstate(std::ios::failbit);
return in;
}
bool redirect = header.d[0];
uint16_t mimeType = fromLittleEndian(reinterpret_cast<const uint16_t*>(header.d));
bool redirect = (mimeType == std::numeric_limits<uint16_t>::max());
char ns = header.d[3];
size_type extraLen;
size_type version = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 4));
dirent.setVersion(version);
if (redirect)
{
log_debug("read redirect entry");
size_type redirectIndex = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 8));
size_type redirectIndex = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 4));
extraLen = fromLittleEndian(reinterpret_cast<const uint16_t*>(header.d + 8));
log_debug("redirectIndex=" << redirectIndex << " extraLen=" << extraLen);
log_debug("redirectIndex=" << redirectIndex);
dirent.setRedirect(redirectIndex);
}
@ -106,7 +108,7 @@ namespace zim
{
log_debug("read article entry");
in.read(header.d + 10, 4);
in.read(header.d + 12, 4);
if (in.fail())
{
log_warn("error reading article dirent header");
@ -116,56 +118,48 @@ namespace zim
if (in.gcount() != 4)
{
log_warn("error reading article dirent header (2)");
return in;
in.setstate(std::ios::failbit);
return in;
}
MimeType mimeType = static_cast<MimeType>(header.d[1]);
size_type clusterNumber = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 4));
size_type blobNumber = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 8));
extraLen = fromLittleEndian(reinterpret_cast<const uint16_t*>(header.d + 12));
size_type clusterNumber = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 8));
size_type blobNumber = fromLittleEndian(reinterpret_cast<const size_type*>(header.d + 12));
log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber << " extraLen=" << extraLen);
log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber);
dirent.setArticle(mimeType, clusterNumber, blobNumber);
}
char ch;
std::string url;
std::string title;
std::string parameter;
log_debug("read title and parameters; extraLen=" << extraLen);
log_debug("read url, title and parameters");
title.reserve(extraLen);
while (extraLen && in.get(ch) && ch != '\0')
{
while (in.get(ch) && ch != '\0')
url += ch;
while (in.get(ch) && ch != '\0')
title += ch;
--extraLen;
}
if (in && extraLen)
{
--extraLen;
parameter.reserve(extraLen);
while (extraLen-- && in.get(ch))
parameter += ch;
}
uint8_t extraLen = static_cast<uint8_t>(header.d[2]);
while (extraLen-- > 0 && in.get(ch))
parameter += ch;
dirent.setTitle(ns, QUnicodeString(title));
dirent.setUrl(ns, url);
dirent.setTitle(title);
dirent.setParameter(parameter);
return in;
}
QUnicodeString Dirent::getUrl() const
std::string Dirent::getLongUrl() const
{
log_trace("Dirent::getUrl()");
log_trace("Dirent::getLongUrl()");
log_debug("namespace=" << getNamespace() << " title=" << getTitle());
log_debug("namespace=" << getNamespace());
log_debug("title=" << getTitle());
return QUnicodeString(std::string(1, getNamespace()) + '/' + getTitle().getValue());
return std::string(1, getNamespace()) + '/' + getUrl();
}
}

View File

@ -0,0 +1,58 @@
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <sstream>
#include <stdlib.h>
namespace zim
{
/**
 * Read an unsigned number from an environment variable.
 *
 * @param env  name of the environment variable to look up
 * @param def  fallback value
 * @return the number the variable starts with; def when the variable is
 *         not set or its content cannot be extracted as an unsigned number
 */
unsigned envValue(const char* env, unsigned def)
{
  const char* v = ::getenv(env);
  if (!v)
    return def;

  // Extract into a scratch variable: since C++11 a failed extraction
  // writes 0 into the target, which would silently discard the default.
  unsigned value = 0;
  std::istringstream s(v);
  if (!(s >> value))
    return def;

  return value;
}
/**
 * Read a memory size from an environment variable.
 *
 * The value is an unsigned number optionally followed by a one-letter
 * unit: k/K (*1024), m/M (*1024^2) or g/G (*1024^3). Any other trailing
 * character is ignored and the number is taken as is.
 *
 * @param env  name of the environment variable to look up
 * @param def  fallback value
 * @return the size with the unit applied; def when the variable is not
 *         set or does not start with an unsigned number; the largest
 *         unsigned value when the scaled size does not fit
 */
unsigned envMemSize(const char* env, unsigned def)
{
  const char* v = ::getenv(env);
  if (!v)
    return def;

  // Extract into a scratch variable: since C++11 a failed extraction
  // writes 0 into the target, which would silently discard the default.
  unsigned value = 0;
  std::istringstream s(v);
  if (!(s >> value))
    return def;

  // a missing unit leaves this at '\0', i.e. factor 1
  char unit = '\0';
  s >> unit;

  unsigned factor = 1;
  switch (unit)
  {
    case 'k':
    case 'K': factor = 1024u; break;
    case 'm':
    case 'M': factor = 1024u * 1024u; break;
    case 'g':
    case 'G': factor = 1024u * 1024u * 1024u; break;
  }

  // saturate instead of wrapping around (e.g. "4G" with 32-bit unsigned)
  const unsigned maxValue = ~0u;
  if (value > maxValue / factor)
    return maxValue;

  return value * factor;
}
}

29
src/zimlib/src/envvalue.h Normal file
View File

@ -0,0 +1,29 @@
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef ZIM_ENVVALUE_H
#define ZIM_ENVVALUE_H
namespace zim
{
/// Returns the unsigned number stored in the environment variable `env`,
/// or `def` when the variable is not set.
unsigned envValue(const char* env, unsigned def);
/// Like envValue(), but the number may carry a one-letter unit suffix:
/// k/K, m/M or g/G scale it by 1024, 1024^2 or 1024^3 respectively.
unsigned envMemSize(const char* env, unsigned def);
}
#endif // ZIM_ENVVALUE_H

View File

@ -26,22 +26,27 @@ log_define("zim.file")
namespace zim
{
Dirent File::getDirent(size_type idx)
{
log_trace("File::getDirent(" << idx << ')');
return impl->getDirent(idx);
}
// Builds an Article that refers to this file and the directory index idx.
// NOTE(review): idx is not range-checked here - presumably that happens
// when the article's dirent is actually fetched; confirm.
Article File::getArticle(size_type idx) const
{
return Article(*this, idx);
}
Article File::getArticle(char ns, const QUnicodeString& title, bool collate)
Article File::getArticle(char ns, const std::string& url)
{
log_trace("File::getArticle('" << ns << "', \"" << title << "\", " << collate << ')');
std::pair<bool, const_iterator> r = findx(ns, title, collate);
log_trace("File::getArticle('" << ns << "', \"" << url << ')');
std::pair<bool, const_iterator> r = findx(ns, url);
return r.first ? *r.second : Article();
}
// Returns the article at position idx of the title index: idx is first
// translated through impl->getIndexByTitle() into the article index that
// Article expects.
Article File::getArticleByTitle(size_type idx)
{
  const size_type articleIdx = impl->getIndexByTitle(idx);
  return Article(*this, articleIdx);
}
// Looks up an article by namespace and title via findxByTitle().
// Returns the matching article, or a default-constructed Article when
// no entry matches.
Article File::getArticleByTitle(char ns, const std::string& title)
{
  log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')');

  const std::pair<bool, const_iterator> hit = findxByTitle(ns, title);
  if (!hit.first)
    return Article();

  return *hit.second;
}
@ -54,12 +59,15 @@ namespace zim
// Iterator positioned at index 0 of this file.
File::const_iterator File::begin()
{ return const_iterator(this, 0); }
// Iterator positioned at index 0, created in const_iterator::ArticleIterator
// mode (presumably: walk the entries in title order - confirm against the
// iterator implementation).
File::const_iterator File::beginByTitle()
{ return const_iterator(this, 0, const_iterator::ArticleIterator); }
// Past-the-end iterator: its index equals getCountArticles().
File::const_iterator File::end()
{ return const_iterator(this, getCountArticles()); }
std::pair<bool, File::const_iterator> File::findx(char ns, const QUnicodeString& title, bool collate)
std::pair<bool, File::const_iterator> File::findx(char ns, const std::string& url)
{
log_debug("find article " << ns << " \"" << title << "\", " << collate << " in file \"" << getFilename() << '"');
log_debug("find article by url " << ns << " \"" << url << "\", in file \"" << getFilename() << '"');
size_type l = getNamespaceBeginOffset(ns);
size_type u = getNamespaceEndOffset(ns);
@ -79,8 +87,8 @@ namespace zim
int c = ns < d.getNamespace() ? -1
: ns > d.getNamespace() ? 1
: (collate ? title.compareCollate(QUnicodeString(d.getTitle()))
: title.compare(QUnicodeString(d.getTitle())));
: url.compare(d.getUrl());
if (c < 0)
u = p;
else if (c > 0)
@ -93,20 +101,70 @@ namespace zim
}
Dirent d = getDirent(l);
int c = collate ? title.compareCollate(QUnicodeString(d.getTitle()))
: title.compare(QUnicodeString(d.getTitle()));
int c = url.compare(d.getUrl());
if (c == 0)
{
log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l);
return std::pair<bool, const_iterator>(true, const_iterator(this, l));
}
log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)");
log_debug("article not found after " << itcount << " iterations (\"" << d.getUrl() << "\" does not match)");
return std::pair<bool, const_iterator>(false, const_iterator(this, u));
}
File::const_iterator File::find(char ns, const QUnicodeString& title, bool collate)
std::pair<bool, File::const_iterator> File::findxByTitle(char ns, const std::string& title)
{
return findx(ns, title, collate).second;
log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"');
size_type l = getNamespaceBeginOffset(ns);
size_type u = getNamespaceEndOffset(ns);
if (l == u)
{
log_debug("namespace " << ns << " not found");
return std::pair<bool, const_iterator>(false, end());
}
unsigned itcount = 0;
while (u - l > 1)
{
++itcount;
size_type p = l + (u - l) / 2;
Dirent d = getDirentByTitle(p);
int c = ns < d.getNamespace() ? -1
: ns > d.getNamespace() ? 1
: title.compare(d.getTitle());
if (c < 0)
u = p;
else if (c > 0)
l = p;
else
{
log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p);
return std::pair<bool, const_iterator>(true, const_iterator(this, p, const_iterator::ArticleIterator));
}
}
Dirent d = getDirentByTitle(l);
int c = title.compare(d.getTitle());
if (c == 0)
{
log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l);
return std::pair<bool, const_iterator>(true, const_iterator(this, l, const_iterator::ArticleIterator));
}
log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)");
return std::pair<bool, const_iterator>(false, const_iterator(this, u, const_iterator::ArticleIterator));
}
// Convenience wrapper around findx(): returns only the iterator part of
// its result and drops the "found" flag.
File::const_iterator File::find(char ns, const std::string& url)
{
  const std::pair<bool, const_iterator> result = findx(ns, url);
  return result.second;
}
// Convenience wrapper around findxByTitle(): returns only the iterator
// part of its result and drops the "found" flag.
File::const_iterator File::findByTitle(char ns, const std::string& title)
{
  const std::pair<bool, const_iterator> result = findxByTitle(ns, title);
  return result.second;
}
}

View File

@ -27,34 +27,36 @@ log_define("zim.file.header")
namespace zim
{
const size_type Fileheader::zimMagic = 0x044d495a; // ="ZIM^d"
const size_type Fileheader::zimVersion = 4;
const size_type Fileheader::size = 56;
const size_type Fileheader::zimVersion = 5;
const size_type Fileheader::size = 72;
std::ostream& operator<< (std::ostream& out, const Fileheader& fh)
{
char header[56];
char header[Fileheader::size];
toLittleEndian(Fileheader::zimMagic, header);
toLittleEndian(Fileheader::zimVersion, header + 4);
std::copy(fh.getUuid().data, fh.getUuid().data + sizeof(Uuid), header + 8);
toLittleEndian(fh.getArticleCount(), header + 24);
toLittleEndian(fh.getIndexPtrPos(), header + 28);
toLittleEndian(fh.getClusterCount(), header + 36);
toLittleEndian(fh.getClusterPtrPos(), header + 40);
toLittleEndian(fh.getMainPage(), header + 48);
toLittleEndian(fh.getLayoutPage(), header + 52);
toLittleEndian(fh.getClusterCount(), header + 28);
toLittleEndian(fh.getUrlPtrPos(), header + 32);
toLittleEndian(fh.getTitleIdxPos(), header + 40);
toLittleEndian(fh.getClusterPtrPos(), header + 48);
toLittleEndian(fh.getMimeListPos(), header + 56);
toLittleEndian(fh.getMainPage(), header + 64);
toLittleEndian(fh.getLayoutPage(), header + 68);
out.write(header, 56);
out.write(header, Fileheader::size);
return out;
}
std::istream& operator>> (std::istream& in, Fileheader& fh)
{
char header[56];
in.read(header, 56);
char header[Fileheader::size];
in.read(header, Fileheader::size);
if (in.fail())
return in;
if (in.gcount() != 56)
if (static_cast<size_type>(in.gcount()) != Fileheader::size)
{
in.setstate(std::ios::failbit);
return in;
@ -69,8 +71,8 @@ namespace zim
return in;
}
size_type version = fromLittleEndian(reinterpret_cast<const size_type*>(header + 4));
if (version != Fileheader::zimVersion)
uint16_t version = fromLittleEndian(reinterpret_cast<const uint16_t*>(header + 4));
if (version != static_cast<size_type>(Fileheader::zimVersion))
{
log_error("invalid zimfile version " << version << " found - "
<< Fileheader::zimVersion << " expected");
@ -81,17 +83,21 @@ namespace zim
Uuid uuid;
std::copy(header + 8, header + 24, uuid.data);
size_type articleCount = fromLittleEndian(reinterpret_cast<const size_type*>(header + 24));
offset_type indexPtrPos = fromLittleEndian(reinterpret_cast<const offset_type*>(header + 28));
size_type blobCount = fromLittleEndian(reinterpret_cast<const size_type*>(header + 36));
offset_type blobPtrPos = fromLittleEndian(reinterpret_cast<const offset_type*>(header + 40));
size_type mainPage = fromLittleEndian(reinterpret_cast<const size_type*>(header + 48));
size_type layoutPage = fromLittleEndian(reinterpret_cast<const size_type*>(header + 52));
size_type clusterCount = fromLittleEndian(reinterpret_cast<const size_type*>(header + 28));
offset_type urlPtrPos = fromLittleEndian(reinterpret_cast<const offset_type*>(header + 32));
offset_type titleIdxPos = fromLittleEndian(reinterpret_cast<const offset_type*>(header + 40));
offset_type clusterPtrPos = fromLittleEndian(reinterpret_cast<const offset_type*>(header + 48));
offset_type mimeListPos = fromLittleEndian(reinterpret_cast<const offset_type*>(header + 56));
size_type mainPage = fromLittleEndian(reinterpret_cast<const size_type*>(header + 64));
size_type layoutPage = fromLittleEndian(reinterpret_cast<const size_type*>(header + 68));
fh.setUuid(uuid);
fh.setArticleCount(articleCount);
fh.setIndexPtrPos(indexPtrPos);
fh.setClusterCount(blobCount);
fh.setClusterPtrPos(blobPtrPos);
fh.setClusterCount(clusterCount);
fh.setUrlPtrPos(urlPtrPos);
fh.setTitleIdxPos(titleIdxPos);
fh.setClusterPtrPos(clusterPtrPos);
fh.setMimeListPos(mimeListPos);
fh.setMainPage(mainPage);
fh.setLayoutPage(layoutPage);

View File

@ -24,11 +24,11 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <sstream>
#include <errno.h>
#include "config.h"
#include "log.h"
#include "envvalue.h"
#ifdef WITH_CXXTOOLS
# include <cxxtools/systemerror.h>
@ -38,20 +38,6 @@ log_define("zim.file.impl")
namespace zim
{
namespace
{
unsigned envValue(const char* env, unsigned def)
{
const char* v = ::getenv(env);
if (v)
{
std::istringstream s(v);
s >> def;
}
return def;
}
}
//////////////////////////////////////////////////////////////////////
// FileImpl
//
@ -60,6 +46,8 @@ namespace zim
direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)),
clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE))
{
log_trace("read file \"" << fname << '"');
if (!zimFile)
throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"');
@ -89,55 +77,41 @@ namespace zim
if (zimFile.fail())
throw ZimFileFormatError("error reading zim-file header");
// read index offsets
{
size_type indexOffsetsSize = header.getArticleCount() * sizeof(OffsetsType::value_type);
log_debug("read " << indexOffsetsSize << " bytes indexptr");
zimFile.seekg(header.getIndexPtrPos());
indexOffsets.resize(header.getArticleCount());
zimFile.read(reinterpret_cast<char*>(&indexOffsets[0]), indexOffsetsSize);
}
if (isBigEndian())
{
for (OffsetsType::iterator it = indexOffsets.begin(); it != indexOffsets.end(); ++it)
*it = fromLittleEndian(&*it);
}
// read cluster offsets
{
size_type clusterOffsetsSize = header.getClusterCount() * sizeof(OffsetsType::value_type);
log_debug("read " << clusterOffsetsSize << " bytes clusterptr");
zimFile.seekg(header.getClusterPtrPos());
clusterOffsets.resize(header.getClusterCount());
zimFile.read(reinterpret_cast<char*>(&clusterOffsets[0]), clusterOffsetsSize);
}
if (isBigEndian())
{
for (OffsetsType::iterator it = clusterOffsets.begin(); it != clusterOffsets.end(); ++it)
*it = fromLittleEndian(&*it);
}
if (clusterOffsets.empty())
if (getCountClusters() == 0)
log_warn("no clusters found");
else
{
offset_type lastOffset = clusterOffsets.back();
offset_type lastOffset = getClusterOffset(getCountClusters() - 1);
log_debug("last offset=" << lastOffset << " file size=" << st.st_size);
if (lastOffset > st.st_size)
if (lastOffset > static_cast<offset_type>(st.st_size))
{
log_fatal("last offset (" << lastOffset << ") larger than file size (" << st.st_size << ')');
throw ZimFileFormatError("last cluster offset larger than file size; file corrupt");
}
}
// read mime types
zimFile.seekg(header.getMimeListPos());
std::string mimeType;
while (true)
{
std::getline(zimFile, mimeType, '\0');
if (zimFile.fail())
throw ZimFileFormatError("error reading mime type list");
if (mimeType.empty())
break;
mimeTypes.push_back(mimeType);;
}
}
Dirent FileImpl::getDirent(size_type idx)
{
log_trace("FileImpl::getDirent(" << idx << ')');
if (idx >= indexOffsets.size())
if (idx >= getCountArticles())
throw ZimFileFormatError("article index out of range");
if (!zimFile)
@ -155,7 +129,9 @@ namespace zim
log_debug("dirent " << idx << " not found in cache; hits " << direntCache.getHits() << " misses " << direntCache.getMisses() << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor " << direntCache.fillfactor());
zimFile.seekg(indexOffsets[idx]);
offset_type indexOffset = getOffset(header.getUrlPtrPos(), idx);
zimFile.seekg(indexOffset);
if (!zimFile)
{
log_warn("failed to seek to directory entry");
@ -171,18 +147,43 @@ namespace zim
throw ZimFileFormatError("failed to read directory entry");
}
log_debug("dirent read from " << indexOffsets[idx]);
log_debug("dirent read from " << indexOffset);
direntCache.put(idx, dirent);
return dirent;
}
// Returns the directory entry found at position idx of the title index.
// Throws ZimFileFormatError when idx is not below getCountArticles().
Dirent FileImpl::getDirentByTitle(size_type idx)
{
  const bool inRange = idx < getCountArticles();
  if (!inRange)
    throw ZimFileFormatError("article index out of range");

  // translate the title-index position into the article index first
  const size_type articleIdx = getIndexByTitle(idx);
  return getDirent(articleIdx);
}
// Reads entry idx of the title index from the file and returns the
// article index stored there.
// Throws ZimFileFormatError when idx is not below getCountArticles() or
// when the stream is in a failed state after the read.
size_type FileImpl::getIndexByTitle(size_type idx)
{
  if (!(idx < getCountArticles()))
    throw ZimFileFormatError("article index out of range");

  // the title index is a flat list of size_type entries
  zimFile.seekg(header.getTitleIdxPos() + sizeof(size_type) * idx);

  size_type articleIdx;
  char* const raw = reinterpret_cast<char*>(&articleIdx);
  zimFile.read(raw, sizeof(size_type));
  if (zimFile.fail())
    throw ZimFileFormatError("error reading title index");

  // entries are little-endian on disk; convert on big-endian hosts
  if (isBigEndian())
    articleIdx = fromLittleEndian(&articleIdx);

  return articleIdx;
}
Cluster FileImpl::getCluster(size_type idx)
{
log_trace("getCluster(" << idx << ')');
if (idx >= clusterOffsets.size())
throw ZimFileFormatError("article index out of range");
if (idx >= getCountClusters())
throw ZimFileFormatError("cluster index out of range");
Cluster cluster = clusterCache.get(idx);
if (cluster)
@ -191,8 +192,9 @@ namespace zim
return cluster;
}
log_debug("read cluster " << idx << " from offset " << clusterOffsets[idx]);
zimFile.seekg(clusterOffsets[idx]);
offset_type clusterOffset = getClusterOffset(idx);
log_debug("read cluster " << idx << " from offset " << clusterOffset);
zimFile.seekg(clusterOffset);
zimFile >> cluster;
if (zimFile.fail())
@ -209,6 +211,21 @@ namespace zim
return cluster;
}
// Reads entry idx from a list of offset_type values that starts at file
// position ptrOffset and returns it.
// Throws ZimFileFormatError when the stream is in a failed state after
// the read.
offset_type FileImpl::getOffset(offset_type ptrOffset, size_type idx)
{
  zimFile.seekg(ptrOffset + sizeof(offset_type) * idx);

  offset_type value;
  char* const raw = reinterpret_cast<char*>(&value);
  zimFile.read(raw, sizeof(offset_type));
  if (zimFile.fail())
    throw ZimFileFormatError("error reading offset");

  // entries are little-endian on disk; convert on big-endian hosts
  if (isBigEndian())
    value = fromLittleEndian(&value);

  return value;
}
size_type FileImpl::getNamespaceBeginOffset(char ch)
{
log_trace("getNamespaceBeginOffset(" << ch << ')');
@ -282,4 +299,16 @@ namespace zim
return namespaces;
}
// Returns the mime type string registered under code idx in the list
// read from the file.
// Throws std::runtime_error when idx does not refer to a list entry.
const std::string& FileImpl::getMimeType(uint16_t idx) const
{
  // valid codes are 0 .. size()-1; idx == size() is already past the end
  if (idx >= mimeTypes.size())
  {
    std::ostringstream msg;
    msg << "unknown mime type code " << idx;
    throw std::runtime_error(msg.str());
  }

  return mimeTypes[idx];
}
}

View File

@ -48,7 +48,7 @@ namespace zim
void IndexArticle::readEntriesZ()
{
std::istringstream s(getParameter());
zim::IZIntStream extra(s);
zim::ZIntStream extra(s);
unsigned flagfield; // field with one bit (bits 0-3) for each cateogry
extra.get(flagfield);
@ -84,7 +84,7 @@ namespace zim
log_debug("read data from offset " << offset << " len " << len);
zim::Blob b = getData();
ptrstream data(const_cast<char*>(b.data() + offset), const_cast<char*>(b.data() + offset + len));
IZIntStream zdata(data);
ZIntStream zdata(data);
unsigned index;
unsigned indexOffset = 0;

View File

@ -67,8 +67,6 @@ namespace zim
InflateStreamBuf::int_type InflateStreamBuf::overflow(int_type c)
{
log_debug("InflateStreamBuf::overflow");
if (pptr())
{
// initialize input-stream for
@ -82,10 +80,8 @@ namespace zim
stream.next_out = (Bytef*)ibuffer();
stream.avail_out = ibuffer_size();
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
ret = ::inflate(&stream, Z_SYNC_FLUSH);
checkError(ret, stream);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret);
// copy zbuffer to sinksource
std::streamsize count = ibuffer_size() - stream.avail_out;
@ -119,14 +115,12 @@ namespace zim
{
// there is data already available
// read compressed data from source into ibuffer
log_debug("in_avail=" << sinksource->in_avail());
stream.avail_in = sinksource->sgetn(ibuffer(), std::min(sinksource->in_avail(), ibuffer_size()));
}
else
{
// no data available
stream.avail_in = sinksource->sgetn(ibuffer(), ibuffer_size());
log_debug(stream.avail_in << " bytes read from source");
if (stream.avail_in == 0)
return traits_type::eof();
}
@ -138,9 +132,7 @@ namespace zim
// at least one character received from source - pass to decompressor
log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in);
int ret = ::inflate(&stream, Z_SYNC_FLUSH);
log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret);
checkError(ret, stream);

View File

@ -0,0 +1,181 @@
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <zim/lzmastream.h>
#include "log.h"
#include <cstring>
#include <sstream>
log_define("zim.lzma.compress")
namespace zim
{
namespace
{
lzma_ret checkError(lzma_ret ret)
{
if (ret != LZMA_OK && ret != LZMA_STREAM_END)
{
std::ostringstream msg;
msg << "lzma-error " << ret;
switch (ret)
{
case LZMA_OK: msg << ": LZMA_OK"; break;
case LZMA_STREAM_END: msg << ": LZMA_STREAM_END"; break;
case LZMA_NO_CHECK: msg << ": LZMA_NO_CHECK"; break;
case LZMA_UNSUPPORTED_CHECK: msg << ": LZMA_UNSUPPORTED_CHECK"; break;
case LZMA_GET_CHECK: msg << ": LZMA_GET_CHECK"; break;
case LZMA_MEM_ERROR: msg << ": LZMA_MEM_ERROR"; break;
case LZMA_MEMLIMIT_ERROR: msg << ": LZMA_MEMLIMIT_ERROR"; break;
case LZMA_FORMAT_ERROR: msg << ": LZMA_FORMAT_ERROR"; break;
case LZMA_OPTIONS_ERROR: msg << ": LZMA_OPTIONS_ERROR"; break;
case LZMA_DATA_ERROR: msg << ": LZMA_DATA_ERROR"; break;
case LZMA_BUF_ERROR: msg << ": LZMA_BUF_ERROR"; break;
case LZMA_PROG_ERROR: msg << ": LZMA_PROG_ERROR"; break;
}
log_error(msg.str());
throw LzmaError(ret, msg.str());
}
return ret;
}
}
// Sets up an lzma encoder that writes its compressed output to sink_.
//   preset, check - forwarded unchanged to lzma_easy_encoder()
//   bufsize_      - size of the internal buffer collecting uncompressed data
// Throws (via checkError) when the encoder cannot be initialized.
LzmaStreamBuf::LzmaStreamBuf(std::streambuf* sink_, uint32_t preset, lzma_check check, unsigned bufsize_)
  : obuffer(bufsize_),
    sink(sink_)
{
  // start from an all-zero lzma_stream before handing it to the encoder
  std::memset(static_cast<void*>(&stream), 0, sizeof(stream));

  const lzma_ret rc = ::lzma_easy_encoder(&stream, preset, check);
  checkError(rc);

  // the whole of obuffer is available as put area
  char* const first = &obuffer[0];
  setp(first, first + obuffer.size());
}
// Hands the encoder state in `stream` to lzma_end(). Nothing is flushed
// here; data still sitting in obuffer is not written to the sink.
LzmaStreamBuf::~LzmaStreamBuf()
{
::lzma_end(&stream);
}
// Called when the put area is full: feeds the buffered bytes to the
// encoder (LZMA_RUN), forwards whatever compressed output it produced to
// the sink and makes room in obuffer again.
// Returns eof() when the sink accepted fewer bytes than offered, 0 otherwise.
LzmaStreamBuf::int_type LzmaStreamBuf::overflow(int_type c)
{
  char* const pending = &obuffer[0];

  // everything from the start of obuffer up to pptr() is input
  stream.next_in = reinterpret_cast<const uint8_t*>(pending);
  stream.avail_in = pptr() - pending;

  // scratch space receiving the compressed bytes
  char compressed[8192];
  stream.next_out = reinterpret_cast<uint8_t*>(compressed);
  stream.avail_out = sizeof(compressed);

  checkError(::lzma_code(&stream, LZMA_RUN));

  // pass the compressed bytes on to the sink
  const std::streamsize produced = sizeof(compressed) - stream.avail_out;
  if (produced > 0 && sink->sputn(compressed, produced) < produced)
    return traits_type::eof();

  // input the encoder did not take is kept at the front of obuffer ...
  if (stream.avail_in > 0)
    memmove(pending, stream.next_in, stream.avail_in);

  // ... and the put area starts right behind it
  setp(pending + stream.avail_in, pending + obuffer.size());

  if (c != traits_type::eof())
    sputc(traits_type::to_char_type(c));

  return 0;
}
// Reading is not supported by the compressing buffer: eof is reported
// unconditionally.
LzmaStreamBuf::int_type LzmaStreamBuf::underflow()
{
return traits_type::eof();
}
// Compresses all bytes currently held in obuffer and writes the result to
// the sink, then makes the whole obuffer available for writing again.
//
// Returns 0 on success and -1 when the sink accepted fewer bytes than
// offered. Throws LzmaError (through checkError) on encoder errors.
//
// NOTE(review): the encoder is driven with LZMA_FINISH here, exactly as in
// end(). Verify against the liblzma documentation of lzma_action whether
// data may still be written to this buffer after a sync().
int LzmaStreamBuf::sync()
{
  // overflow() may keep unconsumed input at the start of obuffer and place
  // pbase() behind it. The pending input therefore has to be counted from the
  // start of obuffer (as overflow() does), not from pbase(); otherwise the
  // trailing bytes would never reach the encoder.
  stream.next_in = reinterpret_cast<const uint8_t*>(&obuffer[0]);
  stream.avail_in = pptr() - &obuffer[0];

  char zbuffer[8192];
  while (stream.avail_in > 0)
  {
    // initialize zbuffer
    stream.next_out = reinterpret_cast<uint8_t*>(zbuffer);
    stream.avail_out = sizeof(zbuffer);

    checkError(::lzma_code(&stream, LZMA_FINISH));

    // copy zbuffer to sink
    std::streamsize count = sizeof(zbuffer) - stream.avail_out;
    if (count > 0)
    {
      std::streamsize n = sink->sputn(zbuffer, count);
      if (n < count)
        return -1;
    }
  }

  // reset outbuffer
  setp(&obuffer[0], &obuffer[0] + obuffer.size());
  return 0;
}
// Finishes the compressed stream: feeds the bytes still held in obuffer to
// the encoder with LZMA_FINISH and keeps writing encoder output to the sink
// until the encoder reports LZMA_STREAM_END.
//
// Returns 0 on success and -1 when the sink accepted fewer bytes than
// offered. Throws LzmaError (through checkError) on encoder errors.
int LzmaStreamBuf::end()
{
  char zbuffer[8192];

  // overflow() may keep unconsumed input at the start of obuffer and place
  // pbase() behind it. The pending input therefore has to be counted from the
  // start of obuffer (as overflow() does), not from pbase(); otherwise the
  // trailing bytes would be missing from the finished stream.
  stream.next_in = reinterpret_cast<const uint8_t*>(&obuffer[0]);
  stream.avail_in = pptr() - &obuffer[0];

  lzma_ret ret;
  do
  {
    // initialize zbuffer
    stream.next_out = reinterpret_cast<uint8_t*>(zbuffer);
    stream.avail_out = sizeof(zbuffer);

    ret = checkError(::lzma_code(&stream, LZMA_FINISH));

    // copy zbuffer to sink
    std::streamsize count = sizeof(zbuffer) - stream.avail_out;
    if (count > 0)
    {
      std::streamsize n = sink->sputn(zbuffer, count);
      if (n < count)
        return -1;
    }
  } while (ret != LZMA_STREAM_END);

  // reset outbuffer
  setp(&obuffer[0], &obuffer[0] + obuffer.size());
  return 0;
}
// Finishes the compressed stream through the stream buffer; a non-zero
// result of the buffer's end() is reflected by setting failbit.
void LzmaStream::end()
{
  const int rc = streambuf.end();
  if (rc != 0)
    setstate(failbit);
}
}

View File

@ -39,8 +39,8 @@ namespace zim
bool operator() (const SearchResult& s1, const SearchResult& s2) const
{
return s1.getPriority() > s2.getPriority()
|| s1.getPriority() == s2.getPriority()
&& s1.getArticle().getTitle() > s2.getArticle().getTitle();
|| (s1.getPriority() == s2.getPriority()
&& s1.getArticle().getTitle() > s2.getArticle().getTitle());
}
};
}
@ -68,7 +68,7 @@ namespace zim
+ Search::getWeightOccOff()
+ Search::getWeightPlus() * itw->second.addweight;
std::string title = article.getTitle().toUtf8();
std::string title = article.getTitle();
for (std::string::iterator it = title.begin(); it != title.end(); ++it)
*it = std::tolower(*it);
@ -165,8 +165,7 @@ namespace zim
log_debug("search for token \"" << token << '"');
QUnicodeString qtoken = QUnicodeString::fromUtf8(token);
IndexArticle indexarticle = indexfile.getArticle('X', qtoken, true);
IndexArticle indexarticle = indexfile.getArticleByTitle('X', token);
if (indexarticle.getTotalCount() > 0)
{
@ -190,7 +189,7 @@ namespace zim
{
log_debug("no entries found - try searching for titles");
Results results;
find(results, 'A', qtoken);
find(results, 'A', token);
for (Results::const_iterator it = results.begin(); it != results.end(); ++it)
{
uint32_t articleIdx = it->getArticle().getIndex();
@ -224,13 +223,13 @@ namespace zim
std::sort(results.begin(), results.end(), PriorityGt());
}
void Search::find(Results& results, char ns, const QUnicodeString& praefix, unsigned limit)
void Search::find(Results& results, char ns, const std::string& praefix, unsigned limit)
{
log_debug("find results in namespace " << ns << " for praefix \"" << praefix << '"');
for (File::const_iterator pos = articlefile.find(ns, praefix, true);
for (File::const_iterator pos = articlefile.findByTitle(ns, praefix);
pos != articlefile.end() && results.size() < limit; ++pos)
{
if (ns != pos->getNamespace() || pos->getTitle().compareCollate(0, praefix.size(), praefix) > 0)
if (ns != pos->getNamespace() || pos->getTitle().compare(0, praefix.size(), praefix) > 0)
{
log_debug("article " << pos->getNamespace() << ", \"" << pos->getTitle() << "\" does not match " << ns << ", \"" << praefix << '"');
break;
@ -240,17 +239,17 @@ namespace zim
log_debug(results.size() << " articles in result");
}
void Search::find(Results& results, char ns, const QUnicodeString& begin,
const QUnicodeString& end, unsigned limit)
void Search::find(Results& results, char ns, const std::string& begin,
const std::string& end, unsigned limit)
{
log_debug("find results in namespace " << ns << " for praefix \"" << begin << '"');
for (File::const_iterator pos = articlefile.find(ns, begin, true);
for (File::const_iterator pos = articlefile.findByTitle(ns, begin);
pos != articlefile.end() && results.size() < limit; ++pos)
{
log_debug("check " << pos->getNamespace() << '/' << pos->getTitle());
if (pos->getNamespace() != ns || pos->getTitle().compareCollate(0, end.size(), end) > 0)
if (pos->getNamespace() != ns || pos->getTitle().compare(end) > 0)
{
log_debug("article \"" << pos->getUrl() << "\" does not match");
log_debug("article " << pos->getNamespace() << ", \"" << pos->getTitle() << "\" does not match");
break;
}
results.push_back(SearchResult(*pos));

View File

@ -0,0 +1,163 @@
/*
* Copyright (C) 2009 Tommi Maekitalo
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "zim/unlzmastream.h"
#include "log.h"
#include "config.h"
#include <sstream>
#include <cstring>
#include "envvalue.h"
log_define("zim.lzma.uncompress")
namespace zim
{
namespace
{
// Validates the return code of a liblzma call made by the decompressor.
//
// LZMA_OK and LZMA_STREAM_END are handed back unchanged. Every other code is
// logged and reported by throwing UnlzmaError; the message consists of the
// numeric code followed by its symbolic name where one is listed below.
//
// NOTE(review): the "inflate-error" prefix is kept as is because it is part
// of the exception text callers may see; consider aligning it with the
// "lzma-error" prefix used by the compressor.
lzma_ret checkError(lzma_ret ret)
{
  if (ret != LZMA_OK && ret != LZMA_STREAM_END)
  {
    std::ostringstream msg;
    msg << "inflate-error " << ret;
    switch (ret)
    {
      case LZMA_OK: msg << ": LZMA_OK"; break;
      case LZMA_STREAM_END: msg << ": LZMA_STREAM_END"; break;
      case LZMA_NO_CHECK: msg << ": LZMA_NO_CHECK"; break;
      case LZMA_UNSUPPORTED_CHECK: msg << ": LZMA_UNSUPPORTED_CHECK"; break;
      case LZMA_GET_CHECK: msg << ": LZMA_GET_CHECK"; break;
      case LZMA_MEM_ERROR: msg << ": LZMA_MEM_ERROR"; break;
      case LZMA_MEMLIMIT_ERROR: msg << ": LZMA_MEMLIMIT_ERROR"; break;
      case LZMA_FORMAT_ERROR: msg << ": LZMA_FORMAT_ERROR"; break;
      case LZMA_OPTIONS_ERROR: msg << ": LZMA_OPTIONS_ERROR"; break;
      case LZMA_DATA_ERROR: msg << ": LZMA_DATA_ERROR"; break;
      case LZMA_BUF_ERROR: msg << ": LZMA_BUF_ERROR"; break;
      case LZMA_PROG_ERROR: msg << ": LZMA_PROG_ERROR"; break;
    }
    // log the text of the message, not the stream object itself
    log_error(msg.str());
    throw UnlzmaError(ret, msg.str());
  }
  return ret;
}
}
// Sets up a decompressing stream buffer.
//
//   sinksource_  stream buffer the compressed data is read from (underflow)
//                or the decompressed data is written to (overflow)
//   bufsize_     number of char_type elements allocated for iobuffer
//
// The decoder memory limit is taken from envMemSize with the key
// ZIM_LZMA_MEMORY_SIZE and a fallback of LZMA_MEMORY_SIZE * 1024 * 1024.
//
// Throws UnlzmaError (through checkError) when lzma_stream_decoder does not
// return a success code.
UnlzmaStreamBuf::UnlzmaStreamBuf(std::streambuf* sinksource_, unsigned bufsize_)
  : iobuffer(new char_type[bufsize_]),
    bufsize(bufsize_),
    sinksource(sinksource_)
{
  std::memset(reinterpret_cast<void*>(&stream), 0, sizeof(stream));

  try
  {
    unsigned memsize = envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024);

    checkError(
      ::lzma_stream_decoder(&stream, memsize, 0));
  }
  catch (...)
  {
    // the destructor does not run for a partially constructed object, so the
    // buffer allocated in the initializer list has to be released here
    delete[] iobuffer;
    throw;
  }
}
// Releases the decoder state via lzma_end and frees the buffer allocated
// with new[] in the constructor.
UnlzmaStreamBuf::~UnlzmaStreamBuf()
{
::lzma_end(&stream);
delete[] iobuffer;
}
// Write direction: decompresses the bytes collected in the put area and
// writes the decompressed data to sinksource.
//
// The put area (obuffer) holds compressed input; ibuffer is used as scratch
// space for the decoder output. Afterwards the put area is reset and, if c is
// not eof, c is stored as its first character.
//
// Returns eof when sinksource accepted fewer bytes than offered, 0 otherwise.
// Throws UnlzmaError (through checkError) on decoder errors.
UnlzmaStreamBuf::int_type UnlzmaStreamBuf::overflow(int_type c)
{
  if (pptr())
  {
    // everything written into the put area so far is decoder input
    stream.next_in = reinterpret_cast<const uint8_t*>(obuffer());
    stream.avail_in = pptr() - pbase();

    for (;;)
    {
      // the decoder writes into ibuffer
      stream.next_out = reinterpret_cast<uint8_t*>(ibuffer());
      stream.avail_out = ibuffer_size();

      const lzma_ret ret = ::lzma_code(&stream, LZMA_RUN);
      checkError(ret);

      // forward the decoded bytes to sinksource
      const std::streamsize produced = ibuffer_size() - stream.avail_out;
      const std::streamsize written = sinksource->sputn(reinterpret_cast<char*>(ibuffer()), produced);
      if (written < produced)
        return traits_type::eof();

      // stop at the end of the stream or when all input has been used
      if (ret == LZMA_STREAM_END || stream.avail_in == 0)
        break;
    }
  }

  // start over with an empty put area
  setp(obuffer(), obuffer() + obuffer_size());

  const bool haveChar = (c != traits_type::eof());
  if (haveChar)
    sputc(traits_type::to_char_type(c));

  return 0;
}
// Read direction: fetches compressed bytes from sinksource into ibuffer and
// decompresses them into obuffer, which then becomes the get area.
//
// Loops until at least one decompressed character is available. Returns that
// character, or eof when sinksource delivers no more data on a full-size
// read. Throws UnlzmaError (through checkError) on decoder errors.
UnlzmaStreamBuf::int_type UnlzmaStreamBuf::underflow()
{
  // the decoder output goes to obuffer
  stream.next_out = reinterpret_cast<uint8_t*>(obuffer());
  stream.avail_out = obuffer_size();

  for (;;)
  {
    // refill ibuffer once the decoder has used up its input
    if (stream.avail_in == 0)
    {
      if (sinksource->in_avail() <= 0)
      {
        // nothing reported as available - request a full buffer
        stream.avail_in = sinksource->sgetn(ibuffer(), ibuffer_size());
        if (stream.avail_in == 0)
          return traits_type::eof();
      }
      else
      {
        // take what is already available, limited to the size of ibuffer
        stream.avail_in = sinksource->sgetn(ibuffer(), std::min(sinksource->in_avail(), ibuffer_size()));
      }

      stream.next_in = reinterpret_cast<const uint8_t*>(ibuffer());
    }

    // decode into obuffer
    checkError(::lzma_code(&stream, LZMA_RUN));

    // expose what has been decoded so far as get area
    setg(obuffer(), obuffer(), obuffer() + obuffer_size() - stream.avail_out);

    if (gptr() != egptr())
      break;
  }

  return sgetc();
}
int UnlzmaStreamBuf::sync()
{
if (pptr() && overflow(traits_type::eof()) == traits_type::eof())
return -1;
return 0;
}
}

View File

@ -18,86 +18,85 @@
*/
#include <zim/zintstream.h>
#include <stdint.h>
#include "log.h"
log_define("zim.zintstream")
namespace zim
{
IZIntStream& IZIntStream::get(unsigned &value)
size_type ZIntStream::get()
{
char ch;
if (!stream.get(ch))
if (!_istream->get(ch))
return *this;
unsigned ret = static_cast<unsigned>(static_cast<unsigned char>(ch));
unsigned numb = ret & 0x3;
ret >>= 2;
unsigned s = 6;
while (numb && stream.get(ch))
if (ch == '\xff')
{
ret += static_cast<unsigned>(
static_cast<unsigned char>(ch)) + 1 << s;
s += 8;
--numb;
log_error("invalid bytestream in int decompressor");
_istream->setstate(std::ios::failbit);
}
size_type uuvalue = static_cast<size_type>(static_cast<unsigned char>(ch));
uint64_t ubound = 0x80;
size_type add = 0;
unsigned short s = 7;
unsigned short N = 0;
size_type mask = 0x7F;
while (ch & 0x80)
{
++N;
ch <<= 1;
--s;
add += ubound;
ubound <<= 7;
mask >>= 1;
}
if (numb)
uuvalue &= mask;
while (N-- && _istream->get(ch))
{
log_error("incomplete bytestream");
stream.setstate(std::ios::failbit);
uuvalue |= static_cast<size_type>(static_cast<unsigned char>(ch)) << s;
s += 8;
}
if (_istream)
{
uuvalue += add;
}
else
value = ret;
{
log_error("incomplete bytestream in int decompressor");
_istream->setstate(std::ios::failbit);
}
return *this;
return uuvalue;
}
OZIntStream& OZIntStream::put(size_type value)
ZIntStream& ZIntStream::put(size_type value)
{
char data[4];
unsigned count;
if (value < 64)
size_type nmask = 0;
size_type mask = 0x7F;
uint64_t ubound = 0x80;
unsigned short N = 0;
while (value >= ubound)
{
count = 1;
data[0] = (value << 2);
log_debug(value << " => " << std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[0])));
}
else if (value < 16384 + 64)
{
value -= 64;
count = 2;
data[0] = value << 2 | 1;
data[1] = value >> 6;
log_debug(value << " => " << std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[0]))
<< std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[1])));
}
else if (value < 4194304 + 16384 + 64)
{
value -= 16384 + 64;
count = 3;
data[0] = value << 2 | 2;
data[1] = value >> 6;
data[2] = value >> 14;
log_debug(value << " => " << std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[0]))
<< std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[1]))
<< std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[2])));
}
else
{
value -= 4194304 + 16384 + 64;
count = 4;
data[0] = value << 2 | 3;
data[1] = value >> 6;
data[2] = value >> 14;
data[3] = value >> 22;
log_debug(value << " => " << std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[0]))
<< std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[1]))
<< std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[2]))
<< std::hex << static_cast<unsigned>(static_cast<unsigned char>(data[4])));
value -= ubound;
ubound <<= 7;
nmask = (nmask >> 1) | 0x80;
mask = mask >> 1;
++N;
}
stream.write(reinterpret_cast<char*>(&data[0]), count);
_ostream->put(static_cast<char>(nmask | (value & mask)));
value >>= 7 - N;
while (N--)
{
_ostream->put(static_cast<char>(value & 0xFF));
value >>= 8;
}
return *this;
}