+ version of zimlib working with its own fstream class (good for Windows and files > 2 GB)

This commit is contained in:
kelson42 2010-04-05 19:31:48 +00:00
parent 247e0659f9
commit 91ee095afb
9 changed files with 325 additions and 43 deletions

View File

@ -20,10 +20,10 @@
#ifndef ZIM_FILEIMPL_H
#define ZIM_FILEIMPL_H
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <zim/fstream.h>
#include <zim/refcounted.h>
#include <zim/zim.h>
#include <zim/fileheader.h>
@ -35,7 +35,7 @@ namespace zim
{
class FileImpl : public RefCounted
{
std::ifstream zimFile;
ifstream zimFile;
Fileheader header;
std::string filename;

View File

@ -0,0 +1,134 @@
/*
* Copyright (C) 2010 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef ZIM_FSTREAM_H
#define ZIM_FSTREAM_H
#include <iostream>
#include <vector>
#include <zim/zim.h>
namespace zim
{
// Stream buffer that serves characters read from a file descriptor through
// an internally owned, resizable byte buffer.
//
// The constructor, destructor, seekg and the virtual hooks are defined out
// of line; only setBufsize lives in the header.
class streambuf : public std::streambuf
{
    std::vector<char> buffer;   // backing storage for the get area
    int fd;                     // descriptor the data is read from

    // The object owns fd, so copying is disabled (declared, never defined):
    // two copies would otherwise both close the same descriptor.
    streambuf(const streambuf&);
    streambuf& operator=(const streambuf&);

    std::streambuf::int_type overflow(std::streambuf::int_type ch);
    std::streambuf::int_type underflow();
    int sync();

  public:
    typedef zim::offset_type offset_type;

    // fname:   path of the file to read
    // bufsize: initial size of the internal buffer in bytes
    streambuf(const char* fname, unsigned bufsize);
    ~streambuf();

    // Repositions the underlying file to the absolute offset off.
    void seekg(offset_type off);

    // Changes the size of the internal buffer to s bytes.
    //
    // Resizing the vector may move its storage, so the get-area pointers are
    // re-anchored afterwards instead of being left dangling. Characters that
    // were already fetched from the file are kept: the buffer never shrinks
    // below the end of the currently valid data, because those bytes cannot
    // be read from the descriptor a second time without seeking.
    void setBufsize(unsigned s)
    {
      typedef std::vector<char>::size_type vsize;

      const vsize consumed = static_cast<vsize>(gptr() - eback());
      const vsize filled = static_cast<vsize>(egptr() - eback());

      buffer.resize(s < filled ? filled : s);

      if (filled > 0)
      {
        char* base = &buffer[0];
        setg(base, base + consumed, base + filled);
      }
    }
};
class ifstream : public std::iostream
{
streambuf myStreambuf;
public:
typedef streambuf::offset_type offset_type;
explicit ifstream(const char* fname, unsigned bufsize = 8192)
: std::iostream(&myStreambuf),
myStreambuf(fname, bufsize)
{ }
void seekg(offset_type off) { myStreambuf.seekg(off); }
void setBufsize(unsigned s) { myStreambuf.setBufsize(s); }
};
}
#endif // ZIM_FSTREAM_H
/*
* Copyright (C) 2010 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#ifndef ZIM_FSTREAM_H
#define ZIM_FSTREAM_H
#include <iostream>
#include <vector>
#include <zim/zim.h>
namespace zim
{
// Stream buffer that serves characters read from a file descriptor through
// an internally owned, resizable byte buffer.
//
// The constructor, destructor, seekg and the virtual hooks are defined out
// of line; only setBufsize lives in the header.
class streambuf : public std::streambuf
{
    std::vector<char> buffer;   // backing storage for the get area
    int fd;                     // descriptor the data is read from

    // The object owns fd, so copying is disabled (declared, never defined):
    // two copies would otherwise both close the same descriptor.
    streambuf(const streambuf&);
    streambuf& operator=(const streambuf&);

    std::streambuf::int_type overflow(std::streambuf::int_type ch);
    std::streambuf::int_type underflow();
    int sync();

  public:
    typedef zim::offset_type offset_type;

    // fname:   path of the file to read
    // bufsize: initial size of the internal buffer in bytes
    streambuf(const char* fname, unsigned bufsize);
    ~streambuf();

    // Repositions the underlying file to the absolute offset off.
    void seekg(offset_type off);

    // Changes the size of the internal buffer to s bytes.
    //
    // Resizing the vector may move its storage, so the get-area pointers are
    // re-anchored afterwards instead of being left dangling. Characters that
    // were already fetched from the file are kept: the buffer never shrinks
    // below the end of the currently valid data, because those bytes cannot
    // be read from the descriptor a second time without seeking.
    void setBufsize(unsigned s)
    {
      typedef std::vector<char>::size_type vsize;

      const vsize consumed = static_cast<vsize>(gptr() - eback());
      const vsize filled = static_cast<vsize>(egptr() - eback());

      buffer.resize(s < filled ? filled : s);

      if (filled > 0)
      {
        char* base = &buffer[0];
        setg(base, base + consumed, base + filled);
      }
    }
};
class ifstream : public std::iostream
{
streambuf myStreambuf;
public:
typedef streambuf::offset_type offset_type;
explicit ifstream(const char* fname, unsigned bufsize = 8192)
: std::iostream(&myStreambuf),
myStreambuf(fname, bufsize)
{ }
void seekg(offset_type off) { myStreambuf.seekg(off); }
void setBufsize(unsigned s) { myStreambuf.setBufsize(s); }
};
}
#endif // ZIM_FSTREAM_H

View File

@ -38,7 +38,7 @@ namespace zim
};
typedef std::map<std::string, WordAttr> WordListType; // map word => count and addweight
typedef std::map<uint32_t, std::string> PosListType; // map position => word
typedef std::map<size_type, std::string> PosListType; // map position => word
WordListType wordList;
PosListType posList;
@ -50,7 +50,7 @@ namespace zim
{ }
const Article& getArticle() const { return article; }
double getPriority() const;
void foundWord(const std::string& word, uint32_t pos, unsigned addweight);
void foundWord(const std::string& word, size_type pos, unsigned addweight);
unsigned getCountWords() const { return wordList.size(); }
unsigned getCountPositions() const { return posList.size(); }
};
@ -83,6 +83,9 @@ namespace zim
File articlefile;
public:
Search()
{ }
explicit Search(const File& zimfile)
: indexfile(zimfile),
articlefile(zimfile)

View File

@ -20,18 +20,74 @@
#ifndef ZIM_ZIM_H
#define ZIM_ZIM_H
#include <stdint.h>
#ifdef _WIN32
typedef unsigned __int64 uint64_t;
typedef signed __int8 int8_t;
typedef signed __int32 int32_t;
#endif
#include <limits.h>
namespace zim
{
typedef uint32_t size_type;
typedef uint64_t offset_type;
#if USHRT_MAX == 0xffff
typedef unsigned short uint16_t;
#elif UINT_MAX == 0xffff
typedef unsigned int uint16_t;
#elif ULONG_MAX == 0xffff
typedef unsigned long uint16_t;
#else
}
#include <stdint.h>
namespace zim
{
#endif
#if USHRT_MAX == 0xffffffffUL
typedef unsigned short size_type;
#elif UINT_MAX == 0xffffffffUL
typedef unsigned int size_type;
#elif ULONG_MAX == 0xffffffffUL
typedef unsigned long size_type;
#else
}
#include <stdint.h>
namespace zim
{
typedef uint32_t size_type;
#endif
#if UINT_MAX == 18446744073709551615ULL
typedef unsigned int offset_type;
#elif ULONG_MAX == 18446744073709551615ULL
typedef unsigned long offset_type;
#elif ULLONG_MAX == 18446744073709551615ULL
typedef unsigned long long offset_type;
#else
}
#include <stdint.h>
namespace zim
{
typedef uint64_t offset_type;
#endif
enum CompressionType
{

View File

@ -1,9 +1,9 @@
/* src/zimlib/src/config.h.in. Generated from configure.ac by autoheader. */
/* src/config.h.in. Generated from configure.in by autoheader. */
/* set zim cluster cache size to number of cached chunks */
/* set cluster cache size to number of cached chunks */
#undef CLUSTER_CACHE_SIZE
/* set zim dirent cache size to number of cached chunks */
/* set dirent cache size to number of cached chunks */
#undef DIRENT_CACHE_SIZE
/* defined if bzip2 compression is enabled */
@ -21,21 +21,6 @@
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `bz2' library (-lbz2). */
#undef HAVE_LIBBZ2
/* Define to 1 if you have the `lzma' library (-llzma). */
#undef HAVE_LIBLZMA
/* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */
#undef HAVE_LIBMICROHTTPD
/* Define to 1 if you have the `unac' library (-lunac). */
#undef HAVE_LIBUNAC
/* Define to 1 if you have the `z' library (-lz). */
#undef HAVE_LIBZ
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
@ -96,3 +81,6 @@
/* Version number of package */
#undef VERSION
/* defined if cxxtools is used */
#undef WITH_CXXTOOLS

View File

@ -23,7 +23,6 @@
#include <zim/endian.h>
#include <sys/types.h>
#include <sys/stat.h>
//#include <unistd.h>
#include <sstream>
#include <errno.h>
#include "config.h"
@ -42,7 +41,7 @@ namespace zim
// FileImpl
//
FileImpl::FileImpl(const char* fname)
: zimFile(fname, std::ios::in | std::ios::binary),
: zimFile(fname),
direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)),
clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE))
{
@ -111,6 +110,8 @@ namespace zim
{
log_trace("FileImpl::getDirent(" << idx << ')');
zimFile.setBufsize(64);
if (idx >= getCountArticles())
throw ZimFileFormatError("article index out of range");
@ -192,6 +193,8 @@ namespace zim
return cluster;
}
zimFile.setBufsize(16384);
offset_type clusterOffset = getClusterOffset(idx);
log_debug("read cluster " << idx << " from offset " << clusterOffset);
zimFile.seekg(clusterOffset);

View File

@ -0,0 +1,97 @@
/*
* Copyright (C) 2010 Tommi Maekitalo
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <zim/fstream.h>
#include "log.h"
#include "config.h"
#include <sstream>
#include <stdexcept>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
log_define("zim.fstream")
namespace zim
{
// Output hook of the stream buffer. The character passed in is never stored;
// every call reports failure by returning traits_type::eof().
std::streambuf::int_type streambuf::overflow(std::streambuf::int_type /*ch*/)
{
  const std::streambuf::int_type refused = traits_type::eof();
  return refused;
}
// Refills the get area with the next chunk of the file.
//
// Reads up to buffer.size() bytes from fd into the internal buffer and makes
// them the new get area.
//
// Returns the first freshly read character, or traits_type::eof() when the
// read delivers no more data.
// Throws std::runtime_error when the read fails.
std::streambuf::int_type streambuf::underflow()
{
  log_debug("underflow; bufsize=" << buffer.size());

  // A zero-sized buffer has no first element to take the address of, and a
  // zero-length read would be indistinguishable from end of file. Fall back
  // to a one-byte buffer, i.e. unbuffered reading.
  if (buffer.empty())
    buffer.resize(1);

  char* p = &buffer[0];

  // A read interrupted by a signal transferred nothing; simply retry it.
  int n;
  do
  {
    n = ::read(fd, p, buffer.size());
  } while (n < 0 && errno == EINTR);

  if (n < 0)
  {
    // Capture errno once: building the message may itself modify it.
    const int err = errno;
    std::ostringstream msg;
    msg << "error " << err << " reading from file: " << strerror(err);
    throw std::runtime_error(msg.str());
  }

  if (n == 0)
    return traits_type::eof();

  setg(p, p, p + n);
  return traits_type::to_int_type(*gptr());
}
// Synchronisation hook of the stream buffer. Performs no work and always
// returns traits_type::eof().
// NOTE(review): std::streambuf::sync conventionally signals failure with -1,
// so callers presumably see every sync as failed - confirm this is intended.
int streambuf::sync()
{
  const int status = traits_type::eof();
  return status;
}
// Opens fname for reading and allocates the internal buffer.
//
// fname:   path of the file to open
// bufsize: initial size of the internal buffer in bytes
//
// The file is opened read-only using the symbolic O_RDONLY flag; on
// platforms that distinguish text and binary mode (O_BINARY is defined) it
// is opened in binary mode so the bytes are delivered untranslated.
//
// Throws std::runtime_error when the file cannot be opened.
streambuf::streambuf(const char* fname, unsigned bufsize)
  : buffer(bufsize),
#ifdef O_BINARY
    fd(::open64(fname, O_RDONLY | O_BINARY))
#else
    fd(::open64(fname, O_RDONLY))
#endif
{
  // Remember the error code of open right away; the logging below may
  // change errno before it is reported.
  const int openErrno = errno;

  log_debug("streambuf for " << fname << " with " << bufsize << " bytes");

  if (fd < 0)
  {
    std::ostringstream msg;
    msg << "error " << openErrno << " opening file \"" << fname << "\": " << strerror(openErrno);
    throw std::runtime_error(msg.str());
  }
}
// Closes the file descriptor held by this buffer. The result of close is
// not checked.
streambuf::~streambuf()
{
  ::close(this->fd);
}
// Moves the read position of the underlying file to the absolute offset off.
//
// The get area is emptied first, so anything still buffered is discarded and
// the next read fetches data from the new position.
//
// Uses lseek64 when HAVE_LSEEK64 is defined, plain lseek otherwise. In the
// latter case off is checked to be representable as off_t, so an offset that
// does not fit is reported instead of silently seeking somewhere else.
//
// Throws std::runtime_error when the offset is out of range or the seek
// fails.
void streambuf::seekg(offset_type off)
{
  setg(0, 0, 0);

#ifdef HAVE_LSEEK64
  const off64_t ret = ::lseek64(fd, off, SEEK_SET);
#else
  const off_t target = static_cast<off_t>(off);
  if (target < 0 || static_cast<offset_type>(target) != off)
  {
    std::ostringstream msg;
    msg << "offset " << off << " is out of range for seeking in file";
    throw std::runtime_error(msg.str());
  }
  const off_t ret = ::lseek(fd, target, SEEK_SET);
#endif

  if (ret < 0)
  {
    // Capture errno once: building the message may itself modify it.
    const int err = errno;
    std::ostringstream msg;
    msg << "error " << err << " seeking to "<< off << " in file: " << strerror(err);
    throw std::runtime_error(msg.str());
  }
}
}

View File

@ -87,14 +87,14 @@ namespace zim
// weight distance between different words
PosListType::const_iterator itp = posList.begin();
std::string word = itp->second;
uint32_t pos = itp->first + word.size();
size_type pos = itp->first + word.size();
for (++itp; itp != posList.end(); ++itp)
{
if (word != itp->second)
{
uint32_t dist = itp->first > pos ? (itp->first - pos)
: itp->first < pos ? (pos - itp->first)
: 1;
size_type dist = itp->first > pos ? (itp->first - pos)
: itp->first < pos ? (pos - itp->first)
: 1;
priority += Search::getWeightDist() / dist;
}
word = itp->second;
@ -118,7 +118,7 @@ namespace zim
return priority;
}
void SearchResult::foundWord(const std::string& word, uint32_t pos, unsigned addweight)
void SearchResult::foundWord(const std::string& word, size_type pos, unsigned addweight)
{
++wordList[word].count;
wordList[word].addweight += addweight;
@ -142,7 +142,7 @@ namespace zim
std::string token;
// map from article-idx to article + relevance-informations
typedef std::map<uint32_t, SearchResult> IndexType;
typedef std::map<size_type, SearchResult> IndexType;
IndexType index;
while (ssearch >> token)
@ -174,8 +174,8 @@ namespace zim
const IndexArticle::EntriesType ent = indexarticle.getCategory(cat);
for (IndexArticle::EntriesType::const_iterator it = ent.begin(); it != ent.end(); ++it)
{
uint32_t articleIdx = it->index;
uint32_t position = it->pos;
size_type articleIdx = it->index;
size_type position = it->pos;
IndexType::iterator itIt = index.insert(
IndexType::value_type(articleIdx,
@ -192,7 +192,7 @@ namespace zim
find(results, 'A', token);
for (Results::const_iterator it = results.begin(); it != results.end(); ++it)
{
uint32_t articleIdx = it->getArticle().getIndex();
size_type articleIdx = it->getArticle().getIndex();
IndexType::iterator itIt = index.insert(
IndexType::value_type(articleIdx,

View File

@ -19,6 +19,7 @@
#include <zim/uuid.h>
#include <iostream>
#include <unistd.h>
#include <time.h>
#include <zim/zim.h> // necessary to have the new types
#include "log.h"
@ -26,7 +27,7 @@
#include <cxxtools/md5stream.h>
#endif
#ifdef WIN32
#ifdef _WIN32
# include <time.h>
# include <windows.h>