diff --git a/src/server/kiwix-serve.cpp b/src/server/kiwix-serve.cpp index 04e239a..da354f8 100644 --- a/src/server/kiwix-serve.cpp +++ b/src/server/kiwix-serve.cpp @@ -80,7 +80,7 @@ static int accessHandlerCallback(void *cls, /* Load the article from the ZIM file */ cout << "Loading '" << title << "' in namespace '" << ns << "'" << endl; try { - std::pair resultPair = zimFileHandler->findx(ns[0], zim::QUnicodeString(title)); + std::pair resultPair = zimFileHandler->findx(ns[0], title); /* Test if the article was found */ if (resultPair.first == true) { diff --git a/src/zimlib/include/zim/article.h b/src/zimlib/include/zim/article.h index 5c3edb9..233dbf0 100644 --- a/src/zimlib/include/zim/article.h +++ b/src/zimlib/include/zim/article.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -50,11 +49,13 @@ namespace zim std::string getParameter() const { return getDirent().getParameter(); } - QUnicodeString getTitle() const { return getDirent().getTitle(); } + std::string getTitle() const { return getDirent().getTitle(); } + std::string getUrl() const { return getDirent().getUrl(); } + std::string getLongUrl() const { return getDirent().getLongUrl(); } - MimeType getLibraryMimeType() const { return getDirent().getMimeType(); } + uint16_t getLibraryMimeType() const { return getDirent().getMimeType(); } const std::string& - getMimeType() const; + getMimeType() const { return file.getMimeType(getLibraryMimeType()); } bool isRedirect() const { return getDirent().isRedirect(); } @@ -67,8 +68,8 @@ namespace zim bool operator< (const Article& a) const { return getNamespace() < a.getNamespace() - || getNamespace() == a.getNamespace() - && getTitle() < a.getTitle(); } + || (getNamespace() == a.getNamespace() + && getTitle() < a.getTitle()); } Cluster getCluster() const { return file.getCluster(getDirent().getClusterNumber()); } @@ -87,8 +88,6 @@ namespace zim File& getFile() { return file; } size_type getIndex() const { return idx; } - 
QUnicodeString getUrl() const { return getDirent().getUrl(); } - bool good() const { return idx != std::numeric_limits::max(); } }; diff --git a/src/zimlib/include/zim/blob.h b/src/zimlib/include/zim/blob.h index d7f7d74..4feabf7 100644 --- a/src/zimlib/include/zim/blob.h +++ b/src/zimlib/include/zim/blob.h @@ -42,9 +42,9 @@ namespace zim { } Blob(ClusterImpl* cluster, const char* data, unsigned size) - : _cluster(cluster), - _data(data), - _size(size) + : _data(data), + _size(size), + _cluster(cluster) { } const char* data() const { return _data; } diff --git a/src/zimlib/include/zim/dirent.h b/src/zimlib/include/zim/dirent.h index 555e64c..514db57 100644 --- a/src/zimlib/include/zim/dirent.h +++ b/src/zimlib/include/zim/dirent.h @@ -22,14 +22,16 @@ #include #include -#include +#include namespace zim { class Dirent { bool redirect; - MimeType mimeType; + uint16_t mimeType; + + size_type version; size_type clusterNumber; // only used when redirect is false size_type blobNumber; // only used when redirect is false @@ -37,14 +39,18 @@ namespace zim size_type redirectIndex; // only used when redirect is true char ns; - QUnicodeString title; + std::string title; + std::string url; std::string parameter; public: Dirent() {} bool isRedirect() const { return redirect; } - MimeType getMimeType() const { return mimeType; } + uint16_t getMimeType() const { return mimeType; } + + size_type getVersion() const { return version; } + void setVersion(size_type v) { version = v; } size_type getClusterNumber() const { return isRedirect() ? 0 : clusterNumber; } size_type getBlobNumber() const { return isRedirect() ? 0 : blobNumber; } @@ -54,26 +60,28 @@ namespace zim size_type getRedirectIndex() const { return isRedirect() ? redirectIndex : 0; } char getNamespace() const { return ns; } - const QUnicodeString& getTitle() const { return title; } + const std::string& getTitle() const { return title.empty() ? 
url : title; } + const std::string& getUrl() const { return url; } + std::string getLongUrl() const; const std::string& getParameter() const { return parameter; } - uint16_t getExtraLen() const - { - uint16_t s = title.getValue().size(); - if (!parameter.empty()) - s += (parameter.size() + 1); - return s; - } - unsigned getDirentSize() const { - return (isRedirect() ? 10 : 14) + getExtraLen(); + unsigned ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2; + if (title != url) + ret += title.size(); + return ret; } - void setTitle(char ns_, const QUnicodeString& title_) + void setTitle(const std::string& title_) + { + title = title_; + } + + void setUrl(char ns_, const std::string& url_) { ns = ns_; - title = title_; + url = url_; } void setParameter(const std::string& parameter_) @@ -85,12 +93,12 @@ namespace zim { redirect = true; redirectIndex = idx; - mimeType = zimMimeNone; + mimeType = std::numeric_limits::max(); clusterNumber = 0; blobNumber = 0; } - void setArticle(MimeType mimeType_, size_type clusterNumber_, size_type blobNumber_) + void setArticle(uint16_t mimeType_, size_type clusterNumber_, size_type blobNumber_) { redirect = false; mimeType = mimeType_; @@ -98,7 +106,6 @@ namespace zim blobNumber = blobNumber_; } - QUnicodeString getUrl() const; }; std::ostream& operator<< (std::ostream& out, const Dirent& fh); diff --git a/src/zimlib/include/zim/file.h b/src/zimlib/include/zim/file.h index abc9c1e..3dd6a6b 100644 --- a/src/zimlib/include/zim/file.h +++ b/src/zimlib/include/zim/file.h @@ -45,11 +45,14 @@ namespace zim const std::string& getFilename() const { return impl->getFilename(); } const Fileheader& getFileheader() const { return impl->getFileheader(); } - Dirent getDirent(size_type idx); + Dirent getDirent(size_type idx) { return impl->getDirent(idx); } + Dirent getDirentByTitle(size_type idx) { return impl->getDirentByTitle(idx); } size_type getCountArticles() const { return impl->getCountArticles(); } Article 
getArticle(size_type idx) const; - Article getArticle(char ns, const QUnicodeString& title, bool collate = false); + Article getArticle(char ns, const std::string& url); + Article getArticleByTitle(size_type idx); + Article getArticleByTitle(char ns, const std::string& title); Cluster getCluster(size_type idx) const { return impl->getCluster(idx); } size_type getCountClusters() const { return impl->getCountClusters(); } @@ -72,12 +75,17 @@ namespace zim class const_iterator; const_iterator begin(); + const_iterator beginByTitle(); const_iterator end(); - std::pair findx(char ns, const QUnicodeString& title, bool collate = false); - const_iterator find(char ns, const QUnicodeString& title, bool collate = false); + std::pair findxByTitle(char ns, const std::string& title); + std::pair findx(char ns, const std::string& url); + const_iterator findByTitle(char ns, const std::string& title); + const_iterator find(char ns, const std::string& url); bool good() const { return impl.getPointer() != 0; } time_t getMTime() const { return impl->getMTime(); } + + const std::string& getMimeType(uint16_t idx) const { return impl->getMimeType(idx); } }; } diff --git a/src/zimlib/include/zim/fileheader.h b/src/zimlib/include/zim/fileheader.h index 09f2c28..6969ee3 100644 --- a/src/zimlib/include/zim/fileheader.h +++ b/src/zimlib/include/zim/fileheader.h @@ -38,7 +38,9 @@ namespace zim private: Uuid uuid; size_type articleCount; - offset_type indexPtrPos; + offset_type titleIdxPos; + offset_type urlPtrPos; + offset_type mimeListPos; size_type blobCount; offset_type blobPtrPos; size_type mainPage; @@ -47,7 +49,8 @@ namespace zim public: Fileheader() : articleCount(0), - indexPtrPos(0), + titleIdxPos(0), + urlPtrPos(0), blobCount(0), blobPtrPos(0), mainPage(std::numeric_limits::max()), @@ -60,22 +63,28 @@ namespace zim size_type getArticleCount() const { return articleCount; } void setArticleCount(size_type s) { articleCount = s; } - offset_type getIndexPtrPos() const { return 
indexPtrPos; } - void setIndexPtrPos(offset_type p) { indexPtrPos = p; } + offset_type getTitleIdxPos() const { return titleIdxPos; } + void setTitleIdxPos(offset_type p) { titleIdxPos = p; } - size_type getClusterCount() const { return blobCount; } - void setClusterCount(size_type s) { blobCount = s; } + offset_type getUrlPtrPos() const { return urlPtrPos; } + void setUrlPtrPos(offset_type p) { urlPtrPos = p; } + + offset_type getMimeListPos() const { return mimeListPos; } + void setMimeListPos(offset_type p) { mimeListPos = p; } + + size_type getClusterCount() const { return blobCount; } + void setClusterCount(size_type s) { blobCount = s; } offset_type getClusterPtrPos() const { return blobPtrPos; } void setClusterPtrPos(offset_type p) { blobPtrPos = p; } - bool hasMainPage() const { return mainPage != std::numeric_limits::max(); } - size_type getMainPage() const { return mainPage; } - void setMainPage(size_type s) { mainPage = s; } + bool hasMainPage() const { return mainPage != std::numeric_limits::max(); } + size_type getMainPage() const { return mainPage; } + void setMainPage(size_type s) { mainPage = s; } - bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); } - size_type getLayoutPage() const { return layoutPage; } - void setLayoutPage(size_type s) { layoutPage = s; } + bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); } + size_type getLayoutPage() const { return layoutPage; } + void setLayoutPage(size_type s) { layoutPage = s; } }; std::ostream& operator<< (std::ostream& out, const Fileheader& fh); diff --git a/src/zimlib/include/zim/fileimpl.h b/src/zimlib/include/zim/fileimpl.h index 209d1a5..3632f0d 100644 --- a/src/zimlib/include/zim/fileimpl.h +++ b/src/zimlib/include/zim/fileimpl.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -40,10 +39,6 @@ namespace zim Fileheader header; std::string filename; - typedef std::vector OffsetsType; - OffsetsType indexOffsets; 
- OffsetsType clusterOffsets; - Cache direntCache; Cache clusterCache; typedef std::map NamespaceCache; @@ -53,6 +48,11 @@ namespace zim std::string namespaces; time_t mtime; + typedef std::vector MimeTypes; + MimeTypes mimeTypes; + + offset_type getOffset(offset_type ptrOffset, size_type idx); + public: explicit FileImpl(const char* fname); @@ -62,11 +62,13 @@ namespace zim const Fileheader& getFileheader() const { return header; } Dirent getDirent(size_type idx); - size_type getCountArticles() const { return indexOffsets.size(); } + Dirent getDirentByTitle(size_type idx); + size_type getIndexByTitle(size_type idx); + size_type getCountArticles() const { return header.getArticleCount(); } Cluster getCluster(size_type idx); - size_type getCountClusters() const { return clusterOffsets.size(); } - offset_type getClusterOffset(size_type idx) const { return clusterOffsets[idx]; } + size_type getCountClusters() const { return header.getClusterCount(); } + offset_type getClusterOffset(size_type idx) { return getOffset(header.getClusterPtrPos(), idx); } size_type getNamespaceBeginOffset(char ch); size_type getNamespaceEndOffset(char ch); @@ -76,6 +78,7 @@ namespace zim std::string getNamespaces(); bool hasNamespace(char ch); + const std::string& getMimeType(uint16_t idx) const; }; } diff --git a/src/zimlib/include/zim/fileiterator.h b/src/zimlib/include/zim/fileiterator.h index dbc9d6d..833f59e 100644 --- a/src/zimlib/include/zim/fileiterator.h +++ b/src/zimlib/include/zim/fileiterator.h @@ -27,30 +27,40 @@ namespace zim { class File::const_iterator : public std::iterator { + public: + enum Mode { + UrlIterator, + ArticleIterator + }; + + private: File* file; size_type idx; mutable Article article; + Mode mode; bool is_end() const { return file == 0 || idx >= file->getCountArticles(); } public: - explicit const_iterator(File* file_ = 0, size_type idx_ = 0) + explicit const_iterator(File* file_ = 0, size_type idx_ = 0, Mode mode_ = UrlIterator) : file(file_), - idx(idx_) + 
idx(idx_), + mode(mode_) { } size_type getIndex() const { return idx; } const File& getFile() const { return *file; } bool operator== (const const_iterator& it) const - { return is_end() && it.is_end() - || file == it.file && idx == it.idx; } + { return (is_end() && it.is_end()) + || (file == it.file && idx == it.idx); } bool operator!= (const const_iterator& it) const { return !operator==(it); } const_iterator& operator++() { ++idx; + article = Article(); return *this; } @@ -64,6 +74,7 @@ namespace zim const_iterator& operator--() { --idx; + article = Article(); return *this; } @@ -74,17 +85,17 @@ namespace zim return *this; } - Article operator*() const + const Article& operator*() const { - if (article.getIndex() != idx) - article = file->getArticle(idx); + if (!article.good()) + article = mode == UrlIterator ? file->getArticle(idx) + : file->getArticleByTitle(idx); return article; } pointer operator->() const { - if (article.getIndex() != idx) - article = file->getArticle(idx); + operator*(); return &article; } diff --git a/src/zimlib/include/zim/lzmastream.h b/src/zimlib/include/zim/lzmastream.h new file mode 100644 index 0000000..4bbb377 --- /dev/null +++ b/src/zimlib/include/zim/lzmastream.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_LZMASTREAM_H +#define ZIM_LZMASTREAM_H + +#include +#include +#include +#include + +namespace zim +{ + class LzmaError : public std::runtime_error + { + lzma_ret ret; + + public: + LzmaError(lzma_ret ret_, const std::string& msg) + : std::runtime_error(msg), + ret(ret_) + { } + + lzma_ret getRetcode() const { return ret; } + }; + + class LzmaStreamBuf : public std::streambuf + { + lzma_stream stream; + std::vector obuffer; + std::streambuf* sink; + + public: + LzmaStreamBuf(std::streambuf* sink_, + uint32_t preset = 3 | LZMA_PRESET_EXTREME, + lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, + unsigned bufsize = 8192); + ~LzmaStreamBuf(); + + /// see std::streambuf + int_type overflow(int_type c); + /// see std::streambuf + int_type underflow(); + /// see std::streambuf + int sync(); + /// end stream + int end(); + + void setSink(std::streambuf* sink_) { sink = sink_; } + }; + + class LzmaStream : public std::ostream + { + LzmaStreamBuf streambuf; + + public: + explicit LzmaStream(std::streambuf* sink, + uint32_t preset = 3 | LZMA_PRESET_EXTREME, + lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, + unsigned bufsize = 8192) + : std::ostream(0), + streambuf(sink, preset, check, bufsize) + { init(&streambuf); } + explicit LzmaStream(std::ostream& sink, + uint32_t preset = 3 | LZMA_PRESET_EXTREME, + lzma_check check = LZMA_CHECK_CRC32 /* LZMA_CHECK_NONE */, + unsigned bufsize = 8192) + : std::ostream(0), + streambuf(sink.rdbuf(), preset, check, bufsize) + { init(&streambuf); } + + void end(); + void setSink(std::streambuf* sink) { streambuf.setSink(sink); } + void setSink(std::ostream& sink) { streambuf.setSink(sink.rdbuf()); } + }; +} + +#endif // ZIM_LZMASTREAM_H diff --git a/src/zimlib/include/zim/search.h 
b/src/zimlib/include/zim/search.h index c149239..6197d7c 100644 --- a/src/zimlib/include/zim/search.h +++ b/src/zimlib/include/zim/search.h @@ -93,8 +93,8 @@ namespace zim { } void search(Results& results, const std::string& expr); - void find(Results& results, char ns, const QUnicodeString& praefix, unsigned limit = searchLimit); - void find(Results& results, char ns, const QUnicodeString& begin, const QUnicodeString& end, unsigned limit = searchLimit); + void find(Results& results, char ns, const std::string& praefix, unsigned limit = searchLimit); + void find(Results& results, char ns, const std::string& begin, const std::string& end, unsigned limit = searchLimit); static double getWeightOcc() { return weightOcc; } static double getWeightOccOff() { return weightOccOff; } diff --git a/src/zimlib/include/zim/smartptr.h b/src/zimlib/include/zim/smartptr.h index 6c5bd2e..796c9dd 100644 --- a/src/zimlib/include/zim/smartptr.h +++ b/src/zimlib/include/zim/smartptr.h @@ -31,25 +31,6 @@ namespace zim { - template - class InternalRefCounted - { - protected: - bool unlink(objectType* object) - { - if (object) - object->release(); - return false; - } - - void link(const InternalRefCounted& ptr, objectType* object) - { - if (object) - object->addRef(); - } - - }; - template class SmartPtr { diff --git a/src/zimlib/include/zim/template.h b/src/zimlib/include/zim/template.h index d274fd5..0a64cdb 100644 --- a/src/zimlib/include/zim/template.h +++ b/src/zimlib/include/zim/template.h @@ -32,7 +32,7 @@ namespace zim public: virtual void onData(const std::string& data) = 0; virtual void onToken(const std::string& token) = 0; - virtual void onLink(char ns, const std::string& title) = 0; + virtual void onLink(char ns, const std::string& url) = 0; }; private: diff --git a/src/zimlib/include/zim/unlzmastream.h b/src/zimlib/include/zim/unlzmastream.h new file mode 100644 index 0000000..6ef8c00 --- /dev/null +++ b/src/zimlib/include/zim/unlzmastream.h @@ -0,0 +1,91 @@ +/* + * Copyright 
(C) 2009 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#ifndef ZIM_UNLZMASTREAM_H +#define ZIM_UNLZMASTREAM_H + +#include +#include +#include + +namespace zim +{ + class UnlzmaError : public std::runtime_error + { + lzma_ret ret; + + public: + UnlzmaError(lzma_ret ret_, const std::string& msg) + : std::runtime_error(msg), + ret(ret_) + { } + + lzma_ret getRetcode() const { return ret; } + }; + + class UnlzmaStreamBuf : public std::streambuf + { + lzma_stream stream; + char_type* iobuffer; + unsigned bufsize; + std::streambuf* sinksource; + + char_type* ibuffer() { return iobuffer; } + std::streamsize ibuffer_size() { return bufsize >> 1; } + char_type* obuffer() { return iobuffer + ibuffer_size(); } + std::streamsize obuffer_size() { return bufsize >> 1; } + + public: + explicit UnlzmaStreamBuf(std::streambuf* sinksource_, unsigned bufsize = 8192); + ~UnlzmaStreamBuf(); + + /// see std::streambuf + int_type overflow(int_type c); + /// see std::streambuf + int_type underflow(); + /// see std::streambuf + int sync(); + + void setSinksource(std::streambuf* sinksource_) { sinksource = sinksource_; } + }; + + class UnlzmaStream : public std::iostream + { + UnlzmaStreamBuf streambuf; + + public: + explicit UnlzmaStream(std::streambuf* sinksource, 
unsigned bufsize = 8192) + : std::iostream(0), + streambuf(sinksource, bufsize) + { init(&streambuf); } + explicit UnlzmaStream(std::ios& sinksource, unsigned bufsize = 8192) + : std::iostream(0), + streambuf(sinksource.rdbuf(), bufsize) + { init(&streambuf); } + + void setSinksource(std::streambuf* sinksource) { streambuf.setSinksource(sinksource); } + void setSinksource(std::ios& sinksource) { streambuf.setSinksource(sinksource.rdbuf()); } + void setSink(std::ostream& sink) { streambuf.setSinksource(sink.rdbuf()); } + void setSource(std::istream& source) { streambuf.setSinksource(source.rdbuf()); } + }; +} + +#endif // ZIM_UNLZMASTREAM_H + diff --git a/src/zimlib/include/zim/zim.h b/src/zimlib/include/zim/zim.h index 3787b8f..e2eba8c 100644 --- a/src/zimlib/include/zim/zim.h +++ b/src/zimlib/include/zim/zim.h @@ -42,23 +42,7 @@ namespace zim zimcompLzma }; - enum MimeType - { - zimMimeNone = -1, - zimMimeTextHtml, - zimMimeTextPlain, - zimMimeImageJpeg, - zimMimeImagePng, - zimMimeImageTiff, - zimMimeTextCss, - zimMimeImageGif, - zimMimeIndex, - zimMimeApplicationJavaScript, - zimMimeImageIcon, - zimMimeTextXml, - zimMimeTextHtmlTemplate - }; - + static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate"; } #endif // ZIM_ZIM_H diff --git a/src/zimlib/include/zim/zintstream.h b/src/zimlib/include/zim/zintstream.h index 81ecb3d..1c78a1f 100644 --- a/src/zimlib/include/zim/zintstream.h +++ b/src/zimlib/include/zim/zintstream.h @@ -24,32 +24,74 @@ #include #include +/* + ZInt implements a int compressor and decompressor. The algorithm compresses + small values into fewer bytes. + + The idea is to add information about used bytes in the first byte. The number + of additional bytes used is specified by the number of set bits counted from + the most significant bit. So the numbers 0-127 are encoded as is, since they + fit into the 7 low order bits and the high order bit specifies, that no + additional bytes are used. 
The numbers starting from 128 up to 16383 need more + than 7 bits, so we need to set the highest order bit to 1 and the next bit to + 0, leaving 6 bits of actual data, which is used as the low order bits of the + number. + + Since the numbers 0-127 are already encoded in one byte, 128 is + subtracted from the actual number, so a 2 byte zero is actually a 128. + + The same logic continues on the 3rd, 4th, ... byte. Up to 7 additional bytes + are used, so the first byte must contain at least one 0. + + binary range + ------------------------------- -------------------------------------------------- + 0xxx xxxx 0 - 127 + 10xx xxxx xxxx xxxx 128 - (2^14+128-1 = 16511) + 110x xxxx xxxx xxxx xxxx xxxx 16512 - (2^21+16512-1 = 2113663) + 1110 xxxx xxxx xxxx xxxx xxxx xxxx xxxx + 2113664 - (2^28+2113664-1 = 270549119) + ... + +*/ + namespace zim { - class IZIntStream + class ZIntStream { - std::istream& stream; + std::istream* _istream; + std::ostream* _ostream; public: - explicit IZIntStream(std::istream& stream_) - : stream(stream_) - { } + /// prepare ZIntStream for compression or decompression + explicit ZIntStream(std::iostream& iostream) + : _istream(&iostream), + _ostream(&iostream) + { } - IZIntStream& get(size_type &value); - operator void*() const { return stream; } - }; + /// prepare ZIntStream for decompression + explicit ZIntStream(std::istream& istream) + : _istream(&istream), + _ostream(0) + { } - class OZIntStream - { - std::ostream& stream; + /// prepare ZIntStream for compression + explicit ZIntStream(std::ostream& ostream) + : _istream(0), + _ostream(&ostream) + { } - public: - explicit OZIntStream(std::ostream& stream_) - : stream(stream_) - { } + /// decompresses one value from input stream and returns it + size_type get(); - OZIntStream& put(size_type value); - operator void*() const { return stream; } + ZIntStream& get(size_type &value) + { value = get(); return *this; } + + /// compresses one value to output stream + ZIntStream& put(size_type 
value); + + operator bool() const + { return (_istream == 0 || *_istream) + && (_ostream == 0 || *_ostream); } }; } diff --git a/src/zimlib/src/article.cpp b/src/zimlib/src/article.cpp index b47120f..e2c0889 100644 --- a/src/zimlib/src/article.cpp +++ b/src/zimlib/src/article.cpp @@ -28,50 +28,6 @@ log_define("zim.article") namespace zim { - const std::string& Article::getMimeType() const - { - static const std::string textHtml = "text/html; charset=UTF-8"; - static const std::string textPlain = "text/plain"; - static const std::string textXml = "application/xml"; - static const std::string imageJpeg = "image/jpeg"; - static const std::string imagePng = "image/png"; - static const std::string imageTiff = "image/tiff"; - static const std::string textCss = "text/css"; - static const std::string imageGif = "image/gif"; - static const std::string index = "text/plain"; - static const std::string applicationJavaScript = "application/x-javascript"; - static const std::string imageIcon = "image/x-icon"; - - switch (getLibraryMimeType()) - { - case zimMimeTextHtml: - case zimMimeTextHtmlTemplate: - return textHtml; - case zimMimeTextPlain: - return textPlain; - case zimMimeImageJpeg: - return imageJpeg; - case zimMimeImagePng: - return imagePng; - case zimMimeImageTiff: - return imageTiff; - case zimMimeTextCss: - return textCss; - case zimMimeImageGif: - return imageGif; - case zimMimeIndex: - return index; - case zimMimeApplicationJavaScript: - return applicationJavaScript; - case zimMimeImageIcon: - return imageIcon; - case zimMimeTextXml: - return textXml; - } - - return textHtml; - } - size_type Article::getArticleSize() const { Dirent dirent = getDirent(); @@ -108,9 +64,9 @@ namespace zim log_trace("onToken(\"" << token << "\")"); if (token == "title") - out << article.getTitle().toUtf8(); + out << article.getTitle(); else if (token == "url") - out << article.getUrl().toUtf8(); + out << article.getUrl(); else if (token == "namespace") out << article.getNamespace(); 
else if (token == "content") @@ -126,11 +82,11 @@ namespace zim } } - void Ev::onLink(char ns, const std::string& title) + void Ev::onLink(char ns, const std::string& url) { if (maxRecurse <= 0) throw std::runtime_error("maximum recursive limit is reached"); - article.getFile().getArticle(ns, QUnicodeString::fromUtf8(title)).getPage(out, false, maxRecurse - 1); + article.getFile().getArticle(ns, url).getPage(out, false, maxRecurse - 1); } } @@ -146,7 +102,7 @@ namespace zim { log_trace("Article::getPage(" << layout << ", " << maxRecurse << ')'); - if (getLibraryMimeType() == zimMimeTextHtml || getLibraryMimeType() == zimMimeTextHtmlTemplate) + if (getMimeType().compare(0, 9, "text/html") == 0 || getMimeType() == MimeHtmlTemplate) { if (layout && file.getFileheader().hasLayoutPage()) { @@ -162,7 +118,7 @@ namespace zim return; } - else if (getLibraryMimeType() == zimMimeTextHtmlTemplate) + else if (getMimeType() == MimeHtmlTemplate) { Blob data = getData(); diff --git a/src/zimlib/src/articlesearch.cpp b/src/zimlib/src/articlesearch.cpp index bb0575e..baeae89 100644 --- a/src/zimlib/src/articlesearch.cpp +++ b/src/zimlib/src/articlesearch.cpp @@ -43,7 +43,7 @@ namespace zim for (File::const_iterator it = articleFile.begin(); it != articleFile.end(); ++it) { - std::string title = it->getTitle().toUtf8(); + std::string title = it->getTitle(); if (title.find(expr) != std::string::npos) ret.push_back(*it); } diff --git a/src/zimlib/src/bunzip2stream.cpp b/src/zimlib/src/bunzip2stream.cpp index b5c19d6..68967f8 100644 --- a/src/zimlib/src/bunzip2stream.cpp +++ b/src/zimlib/src/bunzip2stream.cpp @@ -66,8 +66,6 @@ namespace zim Bunzip2StreamBuf::int_type Bunzip2StreamBuf::overflow(int_type c) { - log_debug("Bunzip2StreamBuf::overflow"); - if (pptr()) { // initialize input-stream for @@ -81,10 +79,8 @@ namespace zim stream.next_out = ibuffer(); stream.avail_out = ibuffer_size(); - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); ret = 
::BZ2_bzDecompress(&stream); checkError(ret, stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret); // copy ibuffer to sinksource std::streamsize count = ibuffer_size() - stream.avail_out; @@ -118,14 +114,12 @@ namespace zim { // there is data already available // read compressed data from source into ibuffer - log_debug("in_avail=" << sinksource->in_avail()); stream.avail_in = sinksource->sgetn(ibuffer(), mymin(sinksource->in_avail(), ibuffer_size())); } else { // no data available stream.avail_in = sinksource->sgetn(ibuffer(), ibuffer_size()); - log_debug(stream.avail_in << " bytes read from source"); if (stream.avail_in == 0) return traits_type::eof(); } @@ -137,9 +131,7 @@ namespace zim // at least one character received from source - pass to decompressor - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); int ret = ::BZ2_bzDecompress(&stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret); checkError(ret, stream); diff --git a/src/zimlib/src/bzip2stream.cpp b/src/zimlib/src/bzip2stream.cpp index 579d640..0c66385 100644 --- a/src/zimlib/src/bzip2stream.cpp +++ b/src/zimlib/src/bzip2stream.cpp @@ -56,14 +56,11 @@ namespace zim Bzip2StreamBuf::~Bzip2StreamBuf() { - log_debug("bzCompressEnd"); ::BZ2_bzCompressEnd(&stream); } Bzip2StreamBuf::int_type Bzip2StreamBuf::overflow(int_type c) { - log_debug("Bzip2StreamBuf::overflow"); - // initialize input-stream stream.next_in = &obuffer[0]; stream.avail_in = pptr() - &obuffer[0]; @@ -74,9 +71,7 @@ namespace zim stream.avail_out = sizeof(zbuffer); // deflate - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " BZ_RUN"); - int ret = checkError(::BZ2_bzCompress(&stream, BZ_RUN), stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret << " total_out_lo32=" << stream.total_out_lo32); + 
checkError(::BZ2_bzCompress(&stream, BZ_RUN), stream); // copy zbuffer to sink / consume deflated data std::streamsize count = sizeof(zbuffer) - stream.avail_out; @@ -106,8 +101,6 @@ namespace zim int Bzip2StreamBuf::sync() { - log_debug("Bzip2StreamBuf::sync"); - // initialize input-stream for stream.next_in = &obuffer[0]; stream.avail_in = pptr() - pbase(); @@ -119,9 +112,7 @@ namespace zim stream.next_out = zbuffer; stream.avail_out = sizeof(zbuffer); - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " BZ_FLUSH"); ret = checkError(::BZ2_bzCompress(&stream, BZ_FLUSH), stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret << " total_out_lo32=" << stream.total_out_lo32); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; @@ -141,8 +132,6 @@ namespace zim int Bzip2StreamBuf::end() { - log_debug("Bzip2StreamBuf::end"); - char zbuffer[8192]; // initialize input-stream for stream.next_in = &obuffer[0]; @@ -154,9 +143,7 @@ namespace zim stream.next_out = zbuffer; stream.avail_out = sizeof(zbuffer); - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " BZ_FINISH"); ret = checkError(::BZ2_bzCompress(&stream, BZ_FINISH), stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret << " total_out_lo32=" << stream.total_out_lo32); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; diff --git a/src/zimlib/src/cluster.cpp b/src/zimlib/src/cluster.cpp index 4f3cd28..73868a3 100644 --- a/src/zimlib/src/cluster.cpp +++ b/src/zimlib/src/cluster.cpp @@ -19,16 +19,33 @@ #include #include +#include +#include #include + #include "log.h" + +#include "config.h" + +#ifdef ENABLE_ZLIB #include #include +#endif + +#ifdef ENABLE_BZIP2 #include #include -#include +#endif + +#ifdef ENABLE_LZMA +#include +#include +#endif log_define("zim.cluster") 
+#define log_debug1(e) + namespace zim { Cluster::Cluster() @@ -50,7 +67,7 @@ namespace zim void ClusterImpl::read(std::istream& in) { - log_debug("read"); + log_debug1("read"); // read first offset, which specifies, how many offsets we need to read size_type offset; @@ -63,7 +80,7 @@ namespace zim size_type n = offset / 4; size_type a = offset; - log_debug("first offset is " << offset << " n=" << n << " a=" << a); + log_debug1("first offset is " << offset << " n=" << n << " a=" << a); // read offsets offsets.clear(); @@ -75,11 +92,11 @@ namespace zim in.read(reinterpret_cast(&offset), sizeof(offset)); if (in.fail()) { - log_debug("fail at " << n); + log_debug1("fail at " << n); return; } offset = fromLittleEndian(&offset); - log_debug("offset=" << offset << '(' << offset-a << ')'); + log_debug1("offset=" << offset << '(' << offset-a << ')'); offsets.push_back(offset - a); } @@ -88,7 +105,7 @@ namespace zim { n = offsets.back() - offsets.front(); data.resize(n); - log_debug("read " << n << " bytes of data"); + log_debug1("read " << n << " bytes of data"); in.read(&(data[0]), n); } } @@ -109,12 +126,9 @@ namespace zim void ClusterImpl::addBlob(const Blob& blob) { - log_debug("addBlob(ptr, " << blob.size() << ')'); + log_debug1("addBlob(ptr, " << blob.size() << ')'); data.insert(data.end(), blob.data(), blob.end()); offsets.push_back(data.size()); - - for (unsigned n = 0; n < offsets.size(); ++n) - log_debug("offset[" << n << "]=" << offsets[n]); } Blob ClusterImpl::getBlob(size_type n) const @@ -141,6 +155,8 @@ namespace zim std::istream& operator>> (std::istream& in, ClusterImpl& clusterImpl) { + log_trace("read cluster"); + char c; in.get(c); clusterImpl.setCompression(static_cast(c)); @@ -154,22 +170,42 @@ namespace zim case zimcompZip: { +#ifdef ENABLE_ZLIB log_debug("uncompress data (zlib)"); zim::InflateStream is(in); + is.exceptions(std::ios::failbit | std::ios::badbit); clusterImpl.read(is); +#else + throw std::runtime_error("zlib not enabled in this 
library"); +#endif break; } case zimcompBzip2: { +#ifdef ENABLE_BZIP2 log_debug("uncompress data (bzip2)"); zim::Bunzip2Stream is(in); + is.exceptions(std::ios::failbit | std::ios::badbit); clusterImpl.read(is); +#else + throw std::runtime_error("bzip2 not enabled in this library"); +#endif break; } case zimcompLzma: - throw std::runtime_error("lzma decompression is not implemented"); + { +#ifdef ENABLE_LZMA + log_debug("uncompress data (lzma)"); + zim::UnlzmaStream is(in); + is.exceptions(std::ios::failbit | std::ios::badbit); + clusterImpl.read(is); +#else + throw std::runtime_error("lzma not enabled in this library"); +#endif + break; + } default: log_error("invalid compression flag " << c); @@ -187,6 +223,8 @@ namespace zim std::ostream& operator<< (std::ostream& out, const ClusterImpl& clusterImpl) { + log_trace("write cluster"); + out.put(static_cast(clusterImpl.getCompression())); switch(clusterImpl.getCompression()) @@ -198,24 +236,65 @@ namespace zim case zimcompZip: { +#ifdef ENABLE_ZLIB log_debug("compress data (zlib)"); zim::DeflateStream os(out); + os.exceptions(std::ios::failbit | std::ios::badbit); clusterImpl.write(os); os.flush(); +#else + throw std::runtime_error("zlib not enabled in this library"); +#endif break; } case zimcompBzip2: { +#ifdef ENABLE_BZIP2 log_debug("compress data (bzip2)"); zim::Bzip2Stream os(out); + os.exceptions(std::ios::failbit | std::ios::badbit); clusterImpl.write(os); os.end(); +#else + throw std::runtime_error("bzip2 not enabled in this library"); +#endif break; } case zimcompLzma: - throw std::runtime_error("lzma compression is not implemented"); + { +#ifdef ENABLE_LZMA + uint32_t lzmaPreset = 3 | LZMA_PRESET_EXTREME; + /** + * read lzma preset from environment + * ZIM_LZMA_PRESET is a number followed optionally by a + * suffix 'e'. The number gives the preset and the suffix tells, + * if LZMA_PRESET_EXTREME should be set. 
+ * e.g.: + * ZIM_LZMA_LEVEL=9 => 9 + * ZIM_LZMA_LEVEL=3e => 3 + extreme + */ + const char* e = ::getenv("ZIM_LZMA_LEVEL"); + if (e) + { + char flag = '\0'; + std::istringstream s(e); + s >> lzmaPreset >> flag; + if (flag == 'e') + lzmaPreset |= LZMA_PRESET_EXTREME; + } + + log_debug("compress data (lzma, " << std::hex << lzmaPreset << ")"); + zim::LzmaStream os(out, lzmaPreset); + os.exceptions(std::ios::failbit | std::ios::badbit); + clusterImpl.write(os); + os.end(); +#else + throw std::runtime_error("lzma not enabled in this library"); +#endif + break; + } default: std::ostringstream msg; diff --git a/src/zimlib/src/config.h b/src/zimlib/src/config.h new file mode 100644 index 0000000..49d23c5 --- /dev/null +++ b/src/zimlib/src/config.h @@ -0,0 +1,99 @@ +/* src/zimlib/src/config.h. Generated from config.h.in by configure. */ +/* src/zimlib/src/config.h.in. Generated from configure.ac by autoheader. */ + +/* set zim cluster cache size to number of cached chunks */ +#define CLUSTER_CACHE_SIZE 16 + +/* set zim dirent cache size to number of cached chunks */ +#define DIRENT_CACHE_SIZE 51200 + +/* defined if bzip2 compression is enabled */ +#define ENABLE_BZIP2 1 + +/* defined if lzma compression is enabled */ +#define ENABLE_LZMA 1 + +/* defined if zlib compression is enabled */ +#define ENABLE_ZLIB 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `bz2' library (-lbz2). */ +#define HAVE_LIBBZ2 1 + +/* Define to 1 if you have the `lzma' library (-llzma). */ +#define HAVE_LIBLZMA 1 + +/* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */ +#define HAVE_LIBMICROHTTPD 1 + +/* Define to 1 if you have the `unac' library (-lunac). */ +#define HAVE_LIBUNAC 1 + +/* Define to 1 if you have the `z' library (-lz). */ +#define HAVE_LIBZ 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `stat64' function. */ +#define HAVE_STAT64 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* set lzma uncompress memory size to number of MB */ +#define LZMA_MEMORY_SIZE 128 + +/* Name of package */ +#define PACKAGE "kiwix" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "kiwix" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "kiwix 0.9" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "kiwix" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "0.9" + +/* Define to 1 if you have the ANSI C header files. 
*/ +#define STDC_HEADERS 1 + +/* Version number of package */ +#define VERSION "0.9" diff --git a/src/zimlib/src/config.h.in b/src/zimlib/src/config.h.in index 6a69a39..fda56ee 100644 --- a/src/zimlib/src/config.h.in +++ b/src/zimlib/src/config.h.in @@ -6,6 +6,15 @@ /* set zim dirent cache size to number of cached chunks */ #undef DIRENT_CACHE_SIZE +/* defined if bzip2 compression is enabled */ +#undef ENABLE_BZIP2 + +/* defined if lzma compression is enabled */ +#undef ENABLE_LZMA + +/* defined if zlib compression is enabled */ +#undef ENABLE_ZLIB + /* Define to 1 if you have the header file. */ #undef HAVE_DLFCN_H @@ -15,6 +24,9 @@ /* Define to 1 if you have the `bz2' library (-lbz2). */ #undef HAVE_LIBBZ2 +/* Define to 1 if you have the `lzma' library (-llzma). */ +#undef HAVE_LIBLZMA + /* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */ #undef HAVE_LIBMICROHTTPD @@ -55,6 +67,9 @@ */ #undef LT_OBJDIR +/* set lzma uncompress memory size to number of MB */ +#undef LZMA_MEMORY_SIZE + /* Name of package */ #undef PACKAGE diff --git a/src/zimlib/src/deflatestream.cpp b/src/zimlib/src/deflatestream.cpp index e9ad3cf..3ef8f76 100644 --- a/src/zimlib/src/deflatestream.cpp +++ b/src/zimlib/src/deflatestream.cpp @@ -70,8 +70,6 @@ namespace zim DeflateStreamBuf::int_type DeflateStreamBuf::overflow(int_type c) { - log_debug("DeflateStreamBuf::overflow"); - // initialize input-stream stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - &obuffer[0]; @@ -82,9 +80,7 @@ namespace zim stream.avail_out = sizeof(zbuffer); // deflate - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); checkError(::deflate(&stream, Z_NO_FLUSH), stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); // copy zbuffer to sink / consume deflated data std::streamsize count = sizeof(zbuffer) - stream.avail_out; @@ -114,8 +110,6 @@ namespace zim int DeflateStreamBuf::sync() { - 
log_debug("DeflateStreamBuf::sync"); - // initialize input-stream for stream.next_in = reinterpret_cast(&obuffer[0]); stream.avail_in = pptr() - pbase(); @@ -126,9 +120,7 @@ namespace zim stream.next_out = (Bytef*)zbuffer; stream.avail_out = sizeof(zbuffer); - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); checkError(::deflate(&stream, Z_SYNC_FLUSH), stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; @@ -157,9 +149,7 @@ namespace zim stream.next_out = (Bytef*)zbuffer; stream.avail_out = sizeof(zbuffer); - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); int ret = checkError(::deflate(&stream, Z_FINISH), stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); // copy zbuffer to sink std::streamsize count = sizeof(zbuffer) - stream.avail_out; diff --git a/src/zimlib/src/dirent.cpp b/src/zimlib/src/dirent.cpp index d95e9d7..0da44d8 100644 --- a/src/zimlib/src/dirent.cpp +++ b/src/zimlib/src/dirent.cpp @@ -35,33 +35,35 @@ namespace zim { union { - char d[12]; + char d[16]; long a; } header; - header.d[0] = static_cast(dirent.isRedirect()); - header.d[1] = static_cast(dirent.getMimeType()); - header.d[2] = '\0'; + toLittleEndian(dirent.getMimeType(), header.d); + header.d[2] = static_cast(dirent.getParameter().size()); header.d[3] = dirent.getNamespace(); - log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().getValue().size() << " extralen=" << dirent.getExtraLen()); + log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size()); + + toLittleEndian(dirent.getVersion(), header.d + 4); if (dirent.isRedirect()) { - toLittleEndian(dirent.getRedirectIndex(), header.d + 4); - toLittleEndian(dirent.getExtraLen(), header.d + 8); - out.write(header.d, 10); + 
toLittleEndian(dirent.getRedirectIndex(), header.d + 8); + out.write(header.d, 12); } else { - toLittleEndian(dirent.getClusterNumber(), header.d + 4); - toLittleEndian(dirent.getBlobNumber(), header.d + 8); - toLittleEndian(dirent.getExtraLen(), header.d + 12); - out.write(header.d, 14); + toLittleEndian(dirent.getClusterNumber(), header.d + 8); + toLittleEndian(dirent.getBlobNumber(), header.d + 12); + out.write(header.d, 16); } - out << dirent.getTitle().getValue(); - if (!dirent.getParameter().empty()) - out << '\0' << dirent.getParameter(); + out << dirent.getUrl() << '\0'; + + std::string t = dirent.getTitle(); + if (t != dirent.getUrl()) + out << t; + out << '\0' << dirent.getParameter(); return out; } @@ -71,34 +73,34 @@ namespace zim union { long a; - char d[14]; + char d[16]; } header; - in.read(header.d, 10); + in.read(header.d, 12); if (in.fail()) { log_warn("error reading dirent header"); return in; } - if (in.gcount() != 10) + if (in.gcount() != 12) { log_warn("error reading dirent header (2)"); in.setstate(std::ios::failbit); return in; } - bool redirect = header.d[0]; + uint16_t mimeType = fromLittleEndian(reinterpret_cast(header.d)); + bool redirect = (mimeType == std::numeric_limits::max()); char ns = header.d[3]; - size_type extraLen; + size_type version = fromLittleEndian(reinterpret_cast(header.d + 4)); + dirent.setVersion(version); + if (redirect) { - log_debug("read redirect entry"); + size_type redirectIndex = fromLittleEndian(reinterpret_cast(header.d + 8)); - size_type redirectIndex = fromLittleEndian(reinterpret_cast(header.d + 4)); - extraLen = fromLittleEndian(reinterpret_cast(header.d + 8)); - - log_debug("redirectIndex=" << redirectIndex << " extraLen=" << extraLen); + log_debug("redirectIndex=" << redirectIndex); dirent.setRedirect(redirectIndex); } @@ -106,7 +108,7 @@ namespace zim { log_debug("read article entry"); - in.read(header.d + 10, 4); + in.read(header.d + 12, 4); if (in.fail()) { log_warn("error reading article dirent 
header"); @@ -116,56 +118,48 @@ namespace zim if (in.gcount() != 4) { log_warn("error reading article dirent header (2)"); - return in; in.setstate(std::ios::failbit); return in; } - MimeType mimeType = static_cast(header.d[1]); - size_type clusterNumber = fromLittleEndian(reinterpret_cast(header.d + 4)); - size_type blobNumber = fromLittleEndian(reinterpret_cast(header.d + 8)); - extraLen = fromLittleEndian(reinterpret_cast(header.d + 12)); + size_type clusterNumber = fromLittleEndian(reinterpret_cast(header.d + 8)); + size_type blobNumber = fromLittleEndian(reinterpret_cast(header.d + 12)); - log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber << " extraLen=" << extraLen); + log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber); dirent.setArticle(mimeType, clusterNumber, blobNumber); } char ch; + std::string url; std::string title; std::string parameter; - log_debug("read title and parameters; extraLen=" << extraLen); + log_debug("read url, title and parameters"); - title.reserve(extraLen); - while (extraLen && in.get(ch) && ch != '\0') - { + while (in.get(ch) && ch != '\0') + url += ch; + + while (in.get(ch) && ch != '\0') title += ch; - --extraLen; - } - if (in && extraLen) - { - --extraLen; - parameter.reserve(extraLen); - while (extraLen-- && in.get(ch)) - parameter += ch; - } + uint8_t extraLen = static_cast(header.d[2]); + while (extraLen-- > 0 && in.get(ch)) + parameter += ch; - dirent.setTitle(ns, QUnicodeString(title)); + dirent.setUrl(ns, url); + dirent.setTitle(title); dirent.setParameter(parameter); return in; } - QUnicodeString Dirent::getUrl() const + std::string Dirent::getLongUrl() const { - log_trace("Dirent::getUrl()"); + log_trace("Dirent::getLongUrl()"); + log_debug("namespace=" << getNamespace() << " title=" << getTitle()); - log_debug("namespace=" << getNamespace()); - log_debug("title=" << getTitle()); - - return 
QUnicodeString(std::string(1, getNamespace()) + '/' + getTitle().getValue()); + return std::string(1, getNamespace()) + '/' + getUrl(); } } diff --git a/src/zimlib/src/envvalue.cpp b/src/zimlib/src/envvalue.cpp new file mode 100644 index 0000000..1d5c64f --- /dev/null +++ b/src/zimlib/src/envvalue.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + +namespace zim +{ + unsigned envValue(const char* env, unsigned def) + { + const char* v = ::getenv(env); + if (v) + { + std::istringstream s(v); + s >> def; + } + return def; + } + + unsigned envMemSize(const char* env, unsigned def) + { + const char* v = ::getenv(env); + if (v) + { + char unit = '\0'; + std::istringstream s(v); + s >> def >> unit; + + switch (unit) + { + case 'k': + case 'K': def *= 1024; break; + case 'm': + case 'M': def *= 1024 * 1024; break; + case 'g': + case 'G': def *= 1024 * 1024 * 1024; break; + } + } + return def; + } +} + diff --git a/src/zimlib/src/envvalue.h b/src/zimlib/src/envvalue.h new file mode 100644 index 0000000..d6dffd4 --- /dev/null +++ b/src/zimlib/src/envvalue.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute 
it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ENVVALUE_H +#define ZIM_ENVVALUE_H + +namespace zim +{ + unsigned envValue(const char* env, unsigned def); + unsigned envMemSize(const char* env, unsigned def); +} + +#endif // ZIM_ENVVALUE_H diff --git a/src/zimlib/src/file.cpp b/src/zimlib/src/file.cpp index c989ce1..c349747 100644 --- a/src/zimlib/src/file.cpp +++ b/src/zimlib/src/file.cpp @@ -26,22 +26,27 @@ log_define("zim.file") namespace zim { - Dirent File::getDirent(size_type idx) - { - log_trace("File::getDirent(" << idx << ')'); - - return impl->getDirent(idx); - } - Article File::getArticle(size_type idx) const { return Article(*this, idx); } - Article File::getArticle(char ns, const QUnicodeString& title, bool collate) + Article File::getArticle(char ns, const std::string& url) { - log_trace("File::getArticle('" << ns << "', \"" << title << "\", " << collate << ')'); - std::pair r = findx(ns, title, collate); + log_trace("File::getArticle('" << ns << "', \"" << url << ')'); + std::pair r = findx(ns, url); + return r.first ? 
*r.second : Article(); + } + + Article File::getArticleByTitle(size_type idx) + { + return Article(*this, impl->getIndexByTitle(idx)); + } + + Article File::getArticleByTitle(char ns, const std::string& title) + { + log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')'); + std::pair r = findxByTitle(ns, title); return r.first ? *r.second : Article(); } @@ -54,12 +59,15 @@ namespace zim File::const_iterator File::begin() { return const_iterator(this, 0); } + File::const_iterator File::beginByTitle() + { return const_iterator(this, 0, const_iterator::ArticleIterator); } + File::const_iterator File::end() { return const_iterator(this, getCountArticles()); } - std::pair File::findx(char ns, const QUnicodeString& title, bool collate) + std::pair File::findx(char ns, const std::string& url) { - log_debug("find article " << ns << " \"" << title << "\", " << collate << " in file \"" << getFilename() << '"'); + log_debug("find article by url " << ns << " \"" << url << "\", in file \"" << getFilename() << '"'); size_type l = getNamespaceBeginOffset(ns); size_type u = getNamespaceEndOffset(ns); @@ -79,8 +87,8 @@ namespace zim int c = ns < d.getNamespace() ? -1 : ns > d.getNamespace() ? 1 - : (collate ? title.compareCollate(QUnicodeString(d.getTitle())) - : title.compare(QUnicodeString(d.getTitle()))); + : url.compare(d.getUrl()); + if (c < 0) u = p; else if (c > 0) @@ -93,20 +101,70 @@ namespace zim } Dirent d = getDirent(l); - int c = collate ? 
title.compareCollate(QUnicodeString(d.getTitle())) - : title.compare(QUnicodeString(d.getTitle())); + int c = url.compare(d.getUrl()); + if (c == 0) { log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); return std::pair(true, const_iterator(this, l)); } - log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)"); + log_debug("article not found after " << itcount << " iterations (\"" << d.getUrl() << "\" does not match)"); return std::pair(false, const_iterator(this, u)); } - File::const_iterator File::find(char ns, const QUnicodeString& title, bool collate) + std::pair File::findxByTitle(char ns, const std::string& title) { - return findx(ns, title, collate).second; + log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"'); + + size_type l = getNamespaceBeginOffset(ns); + size_type u = getNamespaceEndOffset(ns); + + if (l == u) + { + log_debug("namespace " << ns << " not found"); + return std::pair(false, end()); + } + + unsigned itcount = 0; + while (u - l > 1) + { + ++itcount; + size_type p = l + (u - l) / 2; + Dirent d = getDirentByTitle(p); + + int c = ns < d.getNamespace() ? -1 + : ns > d.getNamespace() ? 
1 + : title.compare(d.getTitle()); + + if (c < 0) + u = p; + else if (c > 0) + l = p; + else + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); + return std::pair(true, const_iterator(this, p, const_iterator::ArticleIterator)); + } + } + + Dirent d = getDirentByTitle(l); + int c = title.compare(d.getTitle()); + + if (c == 0) + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); + return std::pair(true, const_iterator(this, l, const_iterator::ArticleIterator)); + } + + log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)"); + return std::pair(false, const_iterator(this, u, const_iterator::ArticleIterator)); } + + File::const_iterator File::find(char ns, const std::string& url) + { return findx(ns, url).second; } + + File::const_iterator File::findByTitle(char ns, const std::string& title) + { return findxByTitle(ns, title).second; } + } diff --git a/src/zimlib/src/fileheader.cpp b/src/zimlib/src/fileheader.cpp index 0033a19..521d12e 100644 --- a/src/zimlib/src/fileheader.cpp +++ b/src/zimlib/src/fileheader.cpp @@ -27,34 +27,36 @@ log_define("zim.file.header") namespace zim { const size_type Fileheader::zimMagic = 0x044d495a; // ="ZIM^d" - const size_type Fileheader::zimVersion = 4; - const size_type Fileheader::size = 56; + const size_type Fileheader::zimVersion = 5; + const size_type Fileheader::size = 72; std::ostream& operator<< (std::ostream& out, const Fileheader& fh) { - char header[56]; + char header[Fileheader::size]; toLittleEndian(Fileheader::zimMagic, header); toLittleEndian(Fileheader::zimVersion, header + 4); std::copy(fh.getUuid().data, fh.getUuid().data + sizeof(Uuid), header + 8); toLittleEndian(fh.getArticleCount(), header + 24); - toLittleEndian(fh.getIndexPtrPos(), header + 28); - toLittleEndian(fh.getClusterCount(), header + 36); - 
toLittleEndian(fh.getClusterPtrPos(), header + 40); - toLittleEndian(fh.getMainPage(), header + 48); - toLittleEndian(fh.getLayoutPage(), header + 52); + toLittleEndian(fh.getClusterCount(), header + 28); + toLittleEndian(fh.getUrlPtrPos(), header + 32); + toLittleEndian(fh.getTitleIdxPos(), header + 40); + toLittleEndian(fh.getClusterPtrPos(), header + 48); + toLittleEndian(fh.getMimeListPos(), header + 56); + toLittleEndian(fh.getMainPage(), header + 64); + toLittleEndian(fh.getLayoutPage(), header + 68); - out.write(header, 56); + out.write(header, Fileheader::size); return out; } std::istream& operator>> (std::istream& in, Fileheader& fh) { - char header[56]; - in.read(header, 56); + char header[Fileheader::size]; + in.read(header, Fileheader::size); if (in.fail()) return in; - if (in.gcount() != 56) + if (static_cast(in.gcount()) != Fileheader::size) { in.setstate(std::ios::failbit); return in; @@ -69,8 +71,8 @@ namespace zim return in; } - size_type version = fromLittleEndian(reinterpret_cast(header + 4)); - if (version != Fileheader::zimVersion) + uint16_t version = fromLittleEndian(reinterpret_cast(header + 4)); + if (version != static_cast(Fileheader::zimVersion)) { log_error("invalid zimfile version " << version << " found - " << Fileheader::zimVersion << " expected"); @@ -81,17 +83,21 @@ namespace zim Uuid uuid; std::copy(header + 8, header + 24, uuid.data); size_type articleCount = fromLittleEndian(reinterpret_cast(header + 24)); - offset_type indexPtrPos = fromLittleEndian(reinterpret_cast(header + 28)); - size_type blobCount = fromLittleEndian(reinterpret_cast(header + 36)); - offset_type blobPtrPos = fromLittleEndian(reinterpret_cast(header + 40)); - size_type mainPage = fromLittleEndian(reinterpret_cast(header + 48)); - size_type layoutPage = fromLittleEndian(reinterpret_cast(header + 52)); + size_type clusterCount = fromLittleEndian(reinterpret_cast(header + 28)); + offset_type urlPtrPos = fromLittleEndian(reinterpret_cast(header + 32)); + 
offset_type titleIdxPos = fromLittleEndian(reinterpret_cast(header + 40)); + offset_type clusterPtrPos = fromLittleEndian(reinterpret_cast(header + 48)); + offset_type mimeListPos = fromLittleEndian(reinterpret_cast(header + 56)); + size_type mainPage = fromLittleEndian(reinterpret_cast(header + 64)); + size_type layoutPage = fromLittleEndian(reinterpret_cast(header + 68)); fh.setUuid(uuid); fh.setArticleCount(articleCount); - fh.setIndexPtrPos(indexPtrPos); - fh.setClusterCount(blobCount); - fh.setClusterPtrPos(blobPtrPos); + fh.setClusterCount(clusterCount); + fh.setUrlPtrPos(urlPtrPos); + fh.setTitleIdxPos(titleIdxPos); + fh.setClusterPtrPos(clusterPtrPos); + fh.setMimeListPos(mimeListPos); fh.setMainPage(mainPage); fh.setLayoutPage(layoutPage); diff --git a/src/zimlib/src/fileimpl.cpp b/src/zimlib/src/fileimpl.cpp index 97d9870..02fb6d9 100644 --- a/src/zimlib/src/fileimpl.cpp +++ b/src/zimlib/src/fileimpl.cpp @@ -24,11 +24,11 @@ #include #include #include -#include #include #include #include "config.h" #include "log.h" +#include "envvalue.h" #ifdef WITH_CXXTOOLS # include @@ -38,20 +38,6 @@ log_define("zim.file.impl") namespace zim { - namespace - { - unsigned envValue(const char* env, unsigned def) - { - const char* v = ::getenv(env); - if (v) - { - std::istringstream s(v); - s >> def; - } - return def; - } - } - ////////////////////////////////////////////////////////////////////// // FileImpl // @@ -60,6 +46,8 @@ namespace zim direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)) { + log_trace("read file \"" << fname << '"'); + if (!zimFile) throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"'); @@ -89,55 +77,41 @@ namespace zim if (zimFile.fail()) throw ZimFileFormatError("error reading zim-file header"); - // read index offsets - { - size_type indexOffsetsSize = header.getArticleCount() * sizeof(OffsetsType::value_type); - log_debug("read " << indexOffsetsSize << 
" bytes indexptr"); - zimFile.seekg(header.getIndexPtrPos()); - indexOffsets.resize(header.getArticleCount()); - zimFile.read(reinterpret_cast(&indexOffsets[0]), indexOffsetsSize); - } - - if (isBigEndian()) - { - for (OffsetsType::iterator it = indexOffsets.begin(); it != indexOffsets.end(); ++it) - *it = fromLittleEndian(&*it); - } - - // read cluster offsets - { - size_type clusterOffsetsSize = header.getClusterCount() * sizeof(OffsetsType::value_type); - log_debug("read " << clusterOffsetsSize << " bytes clusterptr"); - zimFile.seekg(header.getClusterPtrPos()); - clusterOffsets.resize(header.getClusterCount()); - zimFile.read(reinterpret_cast(&clusterOffsets[0]), clusterOffsetsSize); - } - - if (isBigEndian()) - { - for (OffsetsType::iterator it = clusterOffsets.begin(); it != clusterOffsets.end(); ++it) - *it = fromLittleEndian(&*it); - } - - if (clusterOffsets.empty()) + if (getCountClusters() == 0) log_warn("no clusters found"); else { - offset_type lastOffset = clusterOffsets.back(); + offset_type lastOffset = getClusterOffset(getCountClusters() - 1); log_debug("last offset=" << lastOffset << " file size=" << st.st_size); - if (lastOffset > st.st_size) + if (lastOffset > static_cast(st.st_size)) { log_fatal("last offset (" << lastOffset << ") larger than file size (" << st.st_size << ')'); throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); } } + + // read mime types + zimFile.seekg(header.getMimeListPos()); + std::string mimeType; + while (true) + { + std::getline(zimFile, mimeType, '\0'); + + if (zimFile.fail()) + throw ZimFileFormatError("error reading mime type list"); + + if (mimeType.empty()) + break; + + mimeTypes.push_back(mimeType);; + } } Dirent FileImpl::getDirent(size_type idx) { log_trace("FileImpl::getDirent(" << idx << ')'); - if (idx >= indexOffsets.size()) + if (idx >= getCountArticles()) throw ZimFileFormatError("article index out of range"); if (!zimFile) @@ -155,7 +129,9 @@ namespace zim log_debug("dirent 
" << idx << " not found in cache; hits " << direntCache.getHits() << " misses " << direntCache.getMisses() << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor " << direntCache.fillfactor()); - zimFile.seekg(indexOffsets[idx]); + offset_type indexOffset = getOffset(header.getUrlPtrPos(), idx); + + zimFile.seekg(indexOffset); if (!zimFile) { log_warn("failed to seek to directory entry"); @@ -171,18 +147,43 @@ namespace zim throw ZimFileFormatError("failed to read directory entry"); } - log_debug("dirent read from " << indexOffsets[idx]); + log_debug("dirent read from " << indexOffset); direntCache.put(idx, dirent); return dirent; } + Dirent FileImpl::getDirentByTitle(size_type idx) + { + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + return getDirent(getIndexByTitle(idx)); + } + + size_type FileImpl::getIndexByTitle(size_type idx) + { + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + + zimFile.seekg(header.getTitleIdxPos() + sizeof(size_type) * idx); + size_type ret; + zimFile.read(reinterpret_cast(&ret), sizeof(size_type)); + + if (!zimFile) + throw ZimFileFormatError("error reading title index"); + + if (isBigEndian()) + ret = fromLittleEndian(&ret); + + return ret; + } + Cluster FileImpl::getCluster(size_type idx) { log_trace("getCluster(" << idx << ')'); - if (idx >= clusterOffsets.size()) - throw ZimFileFormatError("article index out of range"); + if (idx >= getCountClusters()) + throw ZimFileFormatError("cluster index out of range"); Cluster cluster = clusterCache.get(idx); if (cluster) @@ -191,8 +192,9 @@ namespace zim return cluster; } - log_debug("read cluster " << idx << " from offset " << clusterOffsets[idx]); - zimFile.seekg(clusterOffsets[idx]); + offset_type clusterOffset = getClusterOffset(idx); + log_debug("read cluster " << idx << " from offset " << clusterOffset); + zimFile.seekg(clusterOffset); zimFile >> cluster; if (zimFile.fail()) @@ -209,6 +211,21 
@@ namespace zim return cluster; } + offset_type FileImpl::getOffset(offset_type ptrOffset, size_type idx) + { + zimFile.seekg(ptrOffset + sizeof(offset_type) * idx); + offset_type offset; + zimFile.read(reinterpret_cast(&offset), sizeof(offset_type)); + + if (!zimFile) + throw ZimFileFormatError("error reading offset"); + + if (isBigEndian()) + offset = fromLittleEndian(&offset); + + return offset; + } + size_type FileImpl::getNamespaceBeginOffset(char ch) { log_trace("getNamespaceBeginOffset(" << ch << ')'); @@ -282,4 +299,16 @@ namespace zim return namespaces; } + const std::string& FileImpl::getMimeType(uint16_t idx) const + { + if (idx >= mimeTypes.size()) + { + std::ostringstream msg; + msg << "unknown mime type code " << idx; + throw std::runtime_error(msg.str()); + } + + return mimeTypes[idx]; + } + } diff --git a/src/zimlib/src/indexarticle.cpp b/src/zimlib/src/indexarticle.cpp index 11e0519..992c0b2 100644 --- a/src/zimlib/src/indexarticle.cpp +++ b/src/zimlib/src/indexarticle.cpp @@ -48,7 +48,7 @@ namespace zim void IndexArticle::readEntriesZ() { std::istringstream s(getParameter()); - zim::IZIntStream extra(s); + zim::ZIntStream extra(s); unsigned flagfield; // field with one bit (bits 0-3) for each cateogry extra.get(flagfield); @@ -84,7 +84,7 @@ namespace zim log_debug("read data from offset " << offset << " len " << len); zim::Blob b = getData(); ptrstream data(const_cast(b.data() + offset), const_cast(b.data() + offset + len)); - IZIntStream zdata(data); + ZIntStream zdata(data); unsigned index; unsigned indexOffset = 0; diff --git a/src/zimlib/src/inflatestream.cpp b/src/zimlib/src/inflatestream.cpp index ac7e951..9fa4bdc 100644 --- a/src/zimlib/src/inflatestream.cpp +++ b/src/zimlib/src/inflatestream.cpp @@ -67,8 +67,6 @@ namespace zim InflateStreamBuf::int_type InflateStreamBuf::overflow(int_type c) { - log_debug("InflateStreamBuf::overflow"); - if (pptr()) { // initialize input-stream for @@ -82,10 +80,8 @@ namespace zim stream.next_out =
(Bytef*)ibuffer(); stream.avail_out = ibuffer_size(); - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); ret = ::inflate(&stream, Z_SYNC_FLUSH); checkError(ret, stream); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret); // copy zbuffer to sinksource std::streamsize count = ibuffer_size() - stream.avail_out; @@ -119,14 +115,12 @@ namespace zim { // there is data already available // read compressed data from source into ibuffer - log_debug("in_avail=" << sinksource->in_avail()); stream.avail_in = sinksource->sgetn(ibuffer(), std::min(sinksource->in_avail(), ibuffer_size())); } else { // no data available stream.avail_in = sinksource->sgetn(ibuffer(), ibuffer_size()); - log_debug(stream.avail_in << " bytes read from source"); if (stream.avail_in == 0) return traits_type::eof(); } @@ -138,9 +132,7 @@ namespace zim // at least one character received from source - pass to decompressor - log_debug("pre:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in); int ret = ::inflate(&stream, Z_SYNC_FLUSH); - log_debug("post:avail_out=" << stream.avail_out << " avail_in=" << stream.avail_in << " ret=" << ret); checkError(ret, stream); diff --git a/src/zimlib/src/lzmastream.cpp b/src/zimlib/src/lzmastream.cpp new file mode 100644 index 0000000..f1e7689 --- /dev/null +++ b/src/zimlib/src/lzmastream.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. 
See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include "log.h" +#include +#include + +log_define("zim.lzma.compress") + +namespace zim +{ + namespace + { + lzma_ret checkError(lzma_ret ret) + { + if (ret != LZMA_OK && ret != LZMA_STREAM_END) + { + std::ostringstream msg; + msg << "lzma-error " << ret; + switch (ret) + { + case LZMA_OK: msg << ": LZMA_OK"; break; + case LZMA_STREAM_END: msg << ": LZMA_STREAM_END"; break; + case LZMA_NO_CHECK: msg << ": LZMA_NO_CHECK"; break; + case LZMA_UNSUPPORTED_CHECK: msg << ": LZMA_UNSUPPORTED_CHECK"; break; + case LZMA_GET_CHECK: msg << ": LZMA_GET_CHECK"; break; + case LZMA_MEM_ERROR: msg << ": LZMA_MEM_ERROR"; break; + case LZMA_MEMLIMIT_ERROR: msg << ": LZMA_MEMLIMIT_ERROR"; break; + case LZMA_FORMAT_ERROR: msg << ": LZMA_FORMAT_ERROR"; break; + case LZMA_OPTIONS_ERROR: msg << ": LZMA_OPTIONS_ERROR"; break; + case LZMA_DATA_ERROR: msg << ": LZMA_DATA_ERROR"; break; + case LZMA_BUF_ERROR: msg << ": LZMA_BUF_ERROR"; break; + case LZMA_PROG_ERROR: msg << ": LZMA_PROG_ERROR"; break; + } + log_error(msg.str()); + throw LzmaError(ret, msg.str()); + } + return ret; + } + } + + LzmaStreamBuf::LzmaStreamBuf(std::streambuf* sink_, uint32_t preset, lzma_check check, unsigned bufsize_) + : obuffer(bufsize_), + sink(sink_) + { + std::memset(reinterpret_cast(&stream), 0, sizeof(stream)); + + checkError( + ::lzma_easy_encoder(&stream, preset, check)); + + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + } + + LzmaStreamBuf::~LzmaStreamBuf() + { + ::lzma_end(&stream); + } + + LzmaStreamBuf::int_type LzmaStreamBuf::overflow(int_type c) + { + // initialize input-stream + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - &obuffer[0]; + + // initialize zbuffer for compressed 
data + char zbuffer[8192]; + stream.next_out = reinterpret_cast(zbuffer); + stream.avail_out = sizeof(zbuffer); + + // compress + checkError(::lzma_code(&stream, LZMA_RUN)); + + // copy zbuffer to sink / consume deflated data + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + return traits_type::eof(); + } + + // move remaining characters to start of obuffer + if (stream.avail_in > 0) + memmove(&obuffer[0], stream.next_in, stream.avail_in); + + // reset outbuffer + setp(&obuffer[0] + stream.avail_in, &obuffer[0] + obuffer.size()); + if (c != traits_type::eof()) + sputc(traits_type::to_char_type(c)); + + return 0; + } + + LzmaStreamBuf::int_type LzmaStreamBuf::underflow() + { + return traits_type::eof(); + } + + int LzmaStreamBuf::sync() + { + // initialize input-stream for + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - pbase(); + char zbuffer[8192]; + while (stream.avail_in > 0) + { + // initialize zbuffer + stream.next_out = (uint8_t*)zbuffer; + stream.avail_out = sizeof(zbuffer); + + checkError(::lzma_code(&stream, LZMA_FINISH)); + + // copy zbuffer to sink + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if (n < count) + return -1; + } + }; + + // reset outbuffer + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + return 0; + } + + int LzmaStreamBuf::end() + { + char zbuffer[8192]; + // initialize input-stream for + stream.next_in = reinterpret_cast(&obuffer[0]); + stream.avail_in = pptr() - pbase(); + lzma_ret ret; + do + { + // initialize zbuffer + stream.next_out = (uint8_t*)zbuffer; + stream.avail_out = sizeof(zbuffer); + + ret = checkError(::lzma_code(&stream, LZMA_FINISH)); + + // copy zbuffer to sink + std::streamsize count = sizeof(zbuffer) - stream.avail_out; + if (count > 0) + { + std::streamsize n = sink->sputn(zbuffer, count); + if 
(n < count) + return -1; + } + } while (ret != LZMA_STREAM_END); + + // reset outbuffer + setp(&obuffer[0], &obuffer[0] + obuffer.size()); + return 0; + } + + void LzmaStream::end() + { + if (streambuf.end() != 0) + setstate(failbit); + } + +} diff --git a/src/zimlib/src/search.cpp b/src/zimlib/src/search.cpp index 79d2e2b..4affbb6 100644 --- a/src/zimlib/src/search.cpp +++ b/src/zimlib/src/search.cpp @@ -39,8 +39,8 @@ namespace zim bool operator() (const SearchResult& s1, const SearchResult& s2) const { return s1.getPriority() > s2.getPriority() - || s1.getPriority() == s2.getPriority() - && s1.getArticle().getTitle() > s2.getArticle().getTitle(); + || (s1.getPriority() == s2.getPriority() + && s1.getArticle().getTitle() > s2.getArticle().getTitle()); } }; } @@ -68,7 +68,7 @@ namespace zim + Search::getWeightOccOff() + Search::getWeightPlus() * itw->second.addweight; - std::string title = article.getTitle().toUtf8(); + std::string title = article.getTitle(); for (std::string::iterator it = title.begin(); it != title.end(); ++it) *it = std::tolower(*it); @@ -165,8 +165,7 @@ namespace zim log_debug("search for token \"" << token << '"'); - QUnicodeString qtoken = QUnicodeString::fromUtf8(token); - IndexArticle indexarticle = indexfile.getArticle('X', qtoken, true); + IndexArticle indexarticle = indexfile.getArticleByTitle('X', token); if (indexarticle.getTotalCount() > 0) { @@ -190,7 +189,7 @@ namespace zim { log_debug("no entries found - try searching for titles"); Results results; - find(results, 'A', qtoken); + find(results, 'A', token); for (Results::const_iterator it = results.begin(); it != results.end(); ++it) { uint32_t articleIdx = it->getArticle().getIndex(); @@ -224,13 +223,13 @@ namespace zim std::sort(results.begin(), results.end(), PriorityGt()); } - void Search::find(Results& results, char ns, const QUnicodeString& praefix, unsigned limit) + void Search::find(Results& results, char ns, const std::string& praefix, unsigned limit) { log_debug("find 
results in namespace " << ns << " for praefix \"" << praefix << '"'); - for (File::const_iterator pos = articlefile.find(ns, praefix, true); + for (File::const_iterator pos = articlefile.findByTitle(ns, praefix); pos != articlefile.end() && results.size() < limit; ++pos) { - if (ns != pos->getNamespace() || pos->getTitle().compareCollate(0, praefix.size(), praefix) > 0) + if (ns != pos->getNamespace() || pos->getTitle().compare(0, praefix.size(), praefix) > 0) { log_debug("article " << pos->getNamespace() << ", \"" << pos->getTitle() << "\" does not match " << ns << ", \"" << praefix << '"'); break; @@ -240,17 +239,17 @@ namespace zim log_debug(results.size() << " articles in result"); } - void Search::find(Results& results, char ns, const QUnicodeString& begin, - const QUnicodeString& end, unsigned limit) + void Search::find(Results& results, char ns, const std::string& begin, + const std::string& end, unsigned limit) { log_debug("find results in namespace " << ns << " for praefix \"" << begin << '"'); - for (File::const_iterator pos = articlefile.find(ns, begin, true); + for (File::const_iterator pos = articlefile.findByTitle(ns, begin); pos != articlefile.end() && results.size() < limit; ++pos) { log_debug("check " << pos->getNamespace() << '/' << pos->getTitle()); - if (pos->getNamespace() != ns || pos->getTitle().compareCollate(0, end.size(), end) > 0) + if (pos->getNamespace() != ns || pos->getTitle().compare(end) > 0) { - log_debug("article \"" << pos->getUrl() << "\" does not match"); + log_debug("article " << pos->getNamespace() << ", \"" << pos->getTitle() << "\" does not match"); break; } results.push_back(SearchResult(*pos)); diff --git a/src/zimlib/src/unlzmastream.cpp b/src/zimlib/src/unlzmastream.cpp new file mode 100644 index 0000000..4f5555e --- /dev/null +++ b/src/zimlib/src/unlzmastream.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "zim/unlzmastream.h" +#include "log.h" +#include "config.h" +#include +#include +#include "envvalue.h" + +log_define("zim.lzma.uncompress") + +namespace zim +{ + namespace + { + lzma_ret checkError(lzma_ret ret) + { + if (ret != LZMA_OK && ret != LZMA_STREAM_END) + { + std::ostringstream msg; + msg << "inflate-error " << ret; + switch (ret) + { + case LZMA_OK: msg << ": LZMA_OK"; break; + case LZMA_STREAM_END: msg << ": LZMA_STREAM_END"; break; + case LZMA_NO_CHECK: msg << ": LZMA_NO_CHECK"; break; + case LZMA_UNSUPPORTED_CHECK: msg << ": LZMA_UNSUPPORTED_CHECK"; break; + case LZMA_GET_CHECK: msg << ": LZMA_GET_CHECK"; break; + case LZMA_MEM_ERROR: msg << ": LZMA_MEM_ERROR"; break; + case LZMA_MEMLIMIT_ERROR: msg << ": LZMA_MEMLIMIT_ERROR"; break; + case LZMA_FORMAT_ERROR: msg << ": LZMA_FORMAT_ERROR"; break; + case LZMA_OPTIONS_ERROR: msg << ": LZMA_OPTIONS_ERROR"; break; + case LZMA_DATA_ERROR: msg << ": LZMA_DATA_ERROR"; break; + case LZMA_BUF_ERROR: msg << ": LZMA_BUF_ERROR"; break; + case LZMA_PROG_ERROR: msg << ": LZMA_PROG_ERROR"; break; + } + log_error(msg.str()); + throw UnlzmaError(ret, msg.str()); + } + return ret; + } + + } + + UnlzmaStreamBuf::UnlzmaStreamBuf(std::streambuf* sinksource_, unsigned bufsize_) + : iobuffer(new char_type[bufsize_]), + bufsize(bufsize_), +
sinksource(sinksource_) + { + std::memset(reinterpret_cast(&stream), 0, sizeof(stream)); + + unsigned memsize = envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024); + checkError( + ::lzma_stream_decoder(&stream, memsize, 0)); + } + + UnlzmaStreamBuf::~UnlzmaStreamBuf() + { + ::lzma_end(&stream); + delete[] iobuffer; + } + + UnlzmaStreamBuf::int_type UnlzmaStreamBuf::overflow(int_type c) + { + if (pptr()) + { + // initialize input-stream for + stream.next_in = reinterpret_cast(obuffer()); + stream.avail_in = pptr() - pbase(); + + lzma_ret ret; + do + { + // initialize ibuffer + stream.next_out = reinterpret_cast(ibuffer()); + stream.avail_out = ibuffer_size(); + + ret = ::lzma_code(&stream, LZMA_RUN); + checkError(ret); + + // copy zbuffer to sinksource + std::streamsize count = ibuffer_size() - stream.avail_out; + std::streamsize n = sinksource->sputn(reinterpret_cast(ibuffer()), count); + if (n < count) + return traits_type::eof(); + } while (ret != LZMA_STREAM_END && stream.avail_in > 0); + } + + // reset outbuffer + setp(obuffer(), obuffer() + obuffer_size()); + if (c != traits_type::eof()) + sputc(traits_type::to_char_type(c)); + + return 0; + } + + UnlzmaStreamBuf::int_type UnlzmaStreamBuf::underflow() + { + // read from sinksource and decompress into obuffer + + stream.next_out = reinterpret_cast(obuffer()); + stream.avail_out = obuffer_size(); + + do + { + // fill ibuffer first if needed + if (stream.avail_in == 0) + { + if (sinksource->in_avail() > 0) + { + // there is data already available + // read compressed data from source into ibuffer + stream.avail_in = sinksource->sgetn(ibuffer(), std::min(sinksource->in_avail(), ibuffer_size())); + } + else + { + // no data available + stream.avail_in = sinksource->sgetn(ibuffer(), ibuffer_size()); + if (stream.avail_in == 0) + return traits_type::eof(); + } + + stream.next_in = (const uint8_t*)ibuffer(); + } + + // we decompress it now into obuffer + + // at least one character received from source 
- pass to decompressor + + checkError(::lzma_code(&stream, LZMA_RUN)); + + setg(obuffer(), obuffer(), obuffer() + obuffer_size() - stream.avail_out); + + } while (gptr() == egptr()); + + return sgetc(); + } + + int UnlzmaStreamBuf::sync() + { + if (pptr() && overflow(traits_type::eof()) == traits_type::eof()) + return -1; + return 0; + } +} diff --git a/src/zimlib/src/zintstream.cpp b/src/zimlib/src/zintstream.cpp index a24c86b..6ce9259 100644 --- a/src/zimlib/src/zintstream.cpp +++ b/src/zimlib/src/zintstream.cpp @@ -18,86 +18,85 @@ */ #include +#include #include "log.h" log_define("zim.zintstream") namespace zim { - IZIntStream& IZIntStream::get(unsigned &value) + size_type ZIntStream::get() { char ch; - if (!stream.get(ch)) + if (!_istream->get(ch)) return *this; - unsigned ret = static_cast(static_cast(ch)); - unsigned numb = ret & 0x3; - ret >>= 2; - unsigned s = 6; - while (numb && stream.get(ch)) + if (ch == '\xff') { - ret += static_cast( - static_cast(ch)) + 1 << s; - s += 8; - --numb; + log_error("invalid bytestream in int decompressor"); + _istream->setstate(std::ios::failbit); + } + + size_type uuvalue = static_cast(static_cast(ch)); + uint64_t ubound = 0x80; + size_type add = 0; + unsigned short s = 7; + unsigned short N = 0; + size_type mask = 0x7F; + while (ch & 0x80) + { + ++N; + ch <<= 1; + --s; + add += ubound; + ubound <<= 7; + mask >>= 1; } - if (numb) + uuvalue &= mask; + + while (N-- && _istream->get(ch)) { - log_error("incomplete bytestream"); - stream.setstate(std::ios::failbit); + uuvalue |= static_cast(static_cast(ch)) << s; + s += 8; + } + + if (*_istream) + { + uuvalue += add; } else - value = ret; + { + log_error("incomplete bytestream in int decompressor"); + _istream->setstate(std::ios::failbit); + } - return *this; + return uuvalue; } - OZIntStream& OZIntStream::put(size_type value) + ZIntStream& ZIntStream::put(size_type value) { - char data[4]; - unsigned count; - if (value < 64) + size_type nmask = 0; + size_type mask = 0x7F; +
uint64_t ubound = 0x80; + unsigned short N = 0; + + while (value >= ubound) { - count = 1; - data[0] = (value << 2); - log_debug(value << " => " << std::hex << static_cast(static_cast(data[0]))); - } - else if (value < 16384 + 64) - { - value -= 64; - count = 2; - data[0] = value << 2 | 1; - data[1] = value >> 6; - log_debug(value << " => " << std::hex << static_cast(static_cast(data[0])) - << std::hex << static_cast(static_cast(data[1]))); - } - else if (value < 4194304 + 16384 + 64) - { - value -= 16384 + 64; - count = 3; - data[0] = value << 2 | 2; - data[1] = value >> 6; - data[2] = value >> 14; - log_debug(value << " => " << std::hex << static_cast(static_cast(data[0])) - << std::hex << static_cast(static_cast(data[1])) - << std::hex << static_cast(static_cast(data[2]))); - } - else - { - value -= 4194304 + 16384 + 64; - count = 4; - data[0] = value << 2 | 3; - data[1] = value >> 6; - data[2] = value >> 14; - data[3] = value >> 22; - log_debug(value << " => " << std::hex << static_cast(static_cast(data[0])) - << std::hex << static_cast(static_cast(data[1])) - << std::hex << static_cast(static_cast(data[2])) - << std::hex << static_cast(static_cast(data[4]))); + value -= ubound; + ubound <<= 7; + nmask = (nmask >> 1) | 0x80; + mask = mask >> 1; + ++N; } - stream.write(reinterpret_cast(&data[0]), count); + _ostream->put(static_cast(nmask | (value & mask))); + value >>= 7 - N; + while (N--) + { + _ostream->put(static_cast(value & 0xFF)); + value >>= 8; + } return *this; }