diff --git a/src/zimlib/include/zim/fileimpl.h b/src/zimlib/include/zim/fileimpl.h index 3632f0d..e0bda9b 100644 --- a/src/zimlib/include/zim/fileimpl.h +++ b/src/zimlib/include/zim/fileimpl.h @@ -20,10 +20,10 @@ #ifndef ZIM_FILEIMPL_H #define ZIM_FILEIMPL_H -#include #include #include #include +#include #include #include #include @@ -35,7 +35,7 @@ namespace zim { class FileImpl : public RefCounted { - std::ifstream zimFile; + ifstream zimFile; Fileheader header; std::string filename; diff --git a/src/zimlib/include/zim/fstream.h b/src/zimlib/include/zim/fstream.h new file mode 100644 index 0000000..20aee90 --- /dev/null +++ b/src/zimlib/include/zim/fstream.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2010 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FSTREAM_H +#define ZIM_FSTREAM_H + +#include +#include +#include + +namespace zim +{ + class streambuf : public std::streambuf + { + std::vector buffer; + int fd; + + std::streambuf::int_type overflow(std::streambuf::int_type ch); + std::streambuf::int_type underflow(); + int sync(); + + public: + typedef zim::offset_type offset_type; + + streambuf(const char* fname, unsigned bufsize); + ~streambuf(); + + void seekg(offset_type off); + void setBufsize(unsigned s) + { buffer.resize(s); } + }; + + class ifstream : public std::iostream + { + streambuf myStreambuf; + + public: + typedef streambuf::offset_type offset_type; + + explicit ifstream(const char* fname, unsigned bufsize = 8192) + : std::iostream(&myStreambuf), + myStreambuf(fname, bufsize) + { } + + void seekg(offset_type off) { myStreambuf.seekg(off); } + void setBufsize(unsigned s) { myStreambuf.setBufsize(s); } + }; + +} + +#endif // ZIM_FSTREAM_H +/* + * Copyright (C) 2010 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FSTREAM_H +#define ZIM_FSTREAM_H + +#include +#include +#include + +namespace zim +{ + class streambuf : public std::streambuf + { + std::vector buffer; + int fd; + + std::streambuf::int_type overflow(std::streambuf::int_type ch); + std::streambuf::int_type underflow(); + int sync(); + + public: + typedef zim::offset_type offset_type; + + streambuf(const char* fname, unsigned bufsize); + ~streambuf(); + + void seekg(offset_type off); + void setBufsize(unsigned s) + { buffer.resize(s); } + }; + + class ifstream : public std::iostream + { + streambuf myStreambuf; + + public: + typedef streambuf::offset_type offset_type; + + explicit ifstream(const char* fname, unsigned bufsize = 8192) + : std::iostream(&myStreambuf), + myStreambuf(fname, bufsize) + { } + + void seekg(offset_type off) { myStreambuf.seekg(off); } + void setBufsize(unsigned s) { myStreambuf.setBufsize(s); } + }; + +} + +#endif // ZIM_FSTREAM_H diff --git a/src/zimlib/include/zim/search.h b/src/zimlib/include/zim/search.h index 6197d7c..adf9217 100644 --- a/src/zimlib/include/zim/search.h +++ b/src/zimlib/include/zim/search.h @@ -38,7 +38,7 @@ namespace zim }; typedef std::map WordListType; // map word => count and addweight - typedef std::map PosListType; // map position => word + typedef std::map PosListType; // map position => word WordListType wordList; PosListType posList; @@ -50,7 +50,7 @@ namespace zim { } const Article& getArticle() const { return article; } double getPriority() const; - void foundWord(const std::string& word, uint32_t pos, unsigned addweight); + void foundWord(const std::string& word, size_type pos, unsigned addweight); unsigned getCountWords() const { return wordList.size(); } unsigned getCountPositions() const { return posList.size(); } }; @@ -83,6 +83,9 @@ namespace zim File articlefile; public: + Search() + { } + explicit Search(const File& zimfile) : indexfile(zimfile), articlefile(zimfile) diff --git a/src/zimlib/include/zim/zim.h b/src/zimlib/include/zim/zim.h index e2eba8c..edff32d 100644 --- a/src/zimlib/include/zim/zim.h +++ b/src/zimlib/include/zim/zim.h @@ -20,18 +20,74 @@ #ifndef ZIM_ZIM_H #define ZIM_ZIM_H -#include - -#ifdef _WIN32 -typedef unsigned __int64 uint64_t; -typedef signed __int8 int8_t; -typedef signed __int32 int32_t; -#endif +#include namespace zim { - typedef uint32_t size_type; - typedef uint64_t offset_type; +#if USHRT_MAX == 0xffff + + typedef unsigned short uint16_t; + +#elif UINT_MAX == 0xffff + + typedef unsigned int uint16_t; + +#elif ULONG_MAX == 0xffff + + typedef unsigned long uint16_t; + +#else + +} +#include +namespace zim +{ + +#endif + +#if USHRT_MAX == 0xffffffffUL + + typedef unsigned short size_type; + +#elif UINT_MAX == 0xffffffffUL + + typedef unsigned int size_type; + +#elif ULONG_MAX == 0xffffffffUL + + typedef unsigned long size_type; + +#else + +} +#include +namespace zim +{ + typedef uint32_t size_type; + +#endif + +#if UINT_MAX == 18446744073709551615ULL + + typedef unsigned int offset_type; + +#elif ULONG_MAX == 18446744073709551615ULL + + typedef unsigned long offset_type; + +#elif ULLONG_MAX == 18446744073709551615ULL + + typedef unsigned long long offset_type; + +#else + +} +#include +namespace zim +{ + typedef uint64_t offset_type; + +#endif enum CompressionType { diff --git a/src/zimlib/src/config.h.in b/src/zimlib/src/config.h.in index fda56ee..465c47b 100644 --- a/src/zimlib/src/config.h.in +++ b/src/zimlib/src/config.h.in @@ -1,9 +1,9 @@ -/* src/zimlib/src/config.h.in. Generated from configure.ac by autoheader. */ +/* src/config.h.in. Generated from configure.in by autoheader. */ -/* set zim cluster cache size to number of cached chunks */ +/* set cluster cache size to number of cached chunks */ #undef CLUSTER_CACHE_SIZE -/* set zim dirent cache size to number of cached chunks */ +/* set dirent cache size to number of cached chunks */ #undef DIRENT_CACHE_SIZE /* defined if bzip2 compression is enabled */ @@ -21,21 +21,6 @@ /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H -/* Define to 1 if you have the `bz2' library (-lbz2). */ -#undef HAVE_LIBBZ2 - -/* Define to 1 if you have the `lzma' library (-llzma). */ -#undef HAVE_LIBLZMA - -/* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */ -#undef HAVE_LIBMICROHTTPD - -/* Define to 1 if you have the `unac' library (-lunac). */ -#undef HAVE_LIBUNAC - -/* Define to 1 if you have the `z' library (-lz). */ -#undef HAVE_LIBZ - /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H @@ -96,3 +81,6 @@ /* Version number of package */ #undef VERSION + +/* defined if cxxtools is used */ +#undef WITH_CXXTOOLS diff --git a/src/zimlib/src/fileimpl.cpp b/src/zimlib/src/fileimpl.cpp index ae1038d..0cee279 100644 --- a/src/zimlib/src/fileimpl.cpp +++ b/src/zimlib/src/fileimpl.cpp @@ -23,7 +23,6 @@ #include #include #include -//#include #include #include #include "config.h" @@ -42,7 +41,7 @@ namespace zim // FileImpl // FileImpl::FileImpl(const char* fname) - : zimFile(fname, std::ios::in | std::ios::binary), + : zimFile(fname), direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)) { @@ -111,6 +110,8 @@ namespace zim { log_trace("FileImpl::getDirent(" << idx << ')'); + zimFile.setBufsize(64); + if (idx >= getCountArticles()) throw ZimFileFormatError("article index out of range"); @@ -192,6 +193,8 @@ namespace zim return cluster; } + zimFile.setBufsize(16384); + offset_type clusterOffset = getClusterOffset(idx); log_debug("read cluster " << idx << " from offset " << clusterOffset); zimFile.seekg(clusterOffset); diff --git a/src/zimlib/src/fstream.cpp b/src/zimlib/src/fstream.cpp new file mode 100644 index 0000000..3236d3c --- /dev/null +++ b/src/zimlib/src/fstream.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2010 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include "log.h" +#include "config.h" +#include +#include +#include +#include +#include + +log_define("zim.fstream") + +namespace zim +{ +std::streambuf::int_type streambuf::overflow(std::streambuf::int_type ch) +{ + return traits_type::eof(); +} + +std::streambuf::int_type streambuf::underflow() +{ + log_debug("underflow; bufsize=" << buffer.size()); + + int n = ::read(fd, &buffer[0], buffer.size()); + if (n < 0) + { + std::ostringstream msg; + msg << "error " << errno << " reading from file: " << strerror(errno); + throw std::runtime_error(msg.str()); + } + else if (n == 0) + return traits_type::eof(); + + char* p = &buffer[0]; + setg(p, p, p + n); + return traits_type::to_int_type(*gptr()); +} + +int streambuf::sync() +{ + return traits_type::eof(); +} + +streambuf::streambuf(const char* fname, unsigned bufsize) + : buffer(bufsize), + fd(::open64(fname, 0)) +{ + log_debug("streambuf for " << fname << " with " << bufsize << " bytes"); + + if (fd < 0) + { + std::ostringstream msg; + msg << "error " << errno << " opening file \"" << fname << "\": " << strerror(errno); + throw std::runtime_error(msg.str()); + } +} + +streambuf::~streambuf() +{ + ::close(fd); +} + +void streambuf::seekg(offset_type off) +{ + setg(0, 0, 0); +#ifdef HAVE_LSEEK64 + off64_t ret = ::lseek64(fd, off, SEEK_SET); +#else + off_t ret = ::lseek(fd, off, SEEK_SET); +#endif + if (ret < 0) + { + std::ostringstream msg; + msg << "error " << errno << " seeking to "<< off << " in file: " << strerror(errno); + throw std::runtime_error(msg.str()); + } +} + +} diff --git a/src/zimlib/src/search.cpp b/src/zimlib/src/search.cpp index 4affbb6..d3503f9 100644 --- a/src/zimlib/src/search.cpp +++ b/src/zimlib/src/search.cpp @@ -87,14 +87,14 @@ namespace zim // weight distance between different words PosListType::const_iterator itp = posList.begin(); std::string word = itp->second; - uint32_t pos = itp->first + word.size(); + size_type pos = itp->first + word.size(); for (++itp; itp != posList.end(); ++itp) { if (word != itp->second) { - uint32_t dist = itp->first > pos ? (itp->first - pos) - : itp->first < pos ? (pos - itp->first) - : 1; + size_type dist = itp->first > pos ? (itp->first - pos) + : itp->first < pos ? (pos - itp->first) + : 1; priority += Search::getWeightDist() / dist; } word = itp->second; @@ -118,7 +118,7 @@ namespace zim return priority; } - void SearchResult::foundWord(const std::string& word, uint32_t pos, unsigned addweight) + void SearchResult::foundWord(const std::string& word, size_type pos, unsigned addweight) { ++wordList[word].count; wordList[word].addweight += addweight; @@ -142,7 +142,7 @@ namespace zim std::string token; // map from article-idx to article + relevance-informations - typedef std::map IndexType; + typedef std::map IndexType; IndexType index; while (ssearch >> token) @@ -174,8 +174,8 @@ namespace zim const IndexArticle::EntriesType ent = indexarticle.getCategory(cat); for (IndexArticle::EntriesType::const_iterator it = ent.begin(); it != ent.end(); ++it) { - uint32_t articleIdx = it->index; - uint32_t position = it->pos; + size_type articleIdx = it->index; + size_type position = it->pos; IndexType::iterator itIt = index.insert( IndexType::value_type(articleIdx, @@ -192,7 +192,7 @@ namespace zim find(results, 'A', token); for (Results::const_iterator it = results.begin(); it != results.end(); ++it) { - uint32_t articleIdx = it->getArticle().getIndex(); + size_type articleIdx = it->getArticle().getIndex(); IndexType::iterator itIt = index.insert( IndexType::value_type(articleIdx, diff --git a/src/zimlib/src/uuid.cpp b/src/zimlib/src/uuid.cpp index 8ecd0a1..94b5891 100644 --- a/src/zimlib/src/uuid.cpp +++ b/src/zimlib/src/uuid.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include // necessary to have the new types #include "log.h" @@ -26,7 +27,7 @@ #include #endif -#ifdef WIN32 +#ifdef _WIN32 # include # include