mirror of
https://github.com/kiwix/kiwix-tools.git
synced 2025-09-23 03:52:35 -04:00
+ version of zimlib that uses its own fstream class (works on Windows and with files > 2 GB)
This commit is contained in:
parent
247e0659f9
commit
91ee095afb
@ -20,10 +20,10 @@
|
||||
#ifndef ZIM_FILEIMPL_H
|
||||
#define ZIM_FILEIMPL_H
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <zim/fstream.h>
|
||||
#include <zim/refcounted.h>
|
||||
#include <zim/zim.h>
|
||||
#include <zim/fileheader.h>
|
||||
@ -35,7 +35,7 @@ namespace zim
|
||||
{
|
||||
class FileImpl : public RefCounted
|
||||
{
|
||||
std::ifstream zimFile;
|
||||
ifstream zimFile;
|
||||
Fileheader header;
|
||||
std::string filename;
|
||||
|
||||
|
134
src/zimlib/include/zim/fstream.h
Normal file
134
src/zimlib/include/zim/fstream.h
Normal file
@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Copyright (C) 2010 Tommi Maekitalo
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef ZIM_FSTREAM_H
|
||||
#define ZIM_FSTREAM_H
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <zim/zim.h>
|
||||
|
||||
namespace zim
|
||||
{
|
||||
class streambuf : public std::streambuf
|
||||
{
|
||||
std::vector<char> buffer;
|
||||
int fd;
|
||||
|
||||
std::streambuf::int_type overflow(std::streambuf::int_type ch);
|
||||
std::streambuf::int_type underflow();
|
||||
int sync();
|
||||
|
||||
public:
|
||||
typedef zim::offset_type offset_type;
|
||||
|
||||
streambuf(const char* fname, unsigned bufsize);
|
||||
~streambuf();
|
||||
|
||||
void seekg(offset_type off);
|
||||
void setBufsize(unsigned s)
|
||||
{ buffer.resize(s); }
|
||||
};
|
||||
|
||||
class ifstream : public std::iostream
|
||||
{
|
||||
streambuf myStreambuf;
|
||||
|
||||
public:
|
||||
typedef streambuf::offset_type offset_type;
|
||||
|
||||
explicit ifstream(const char* fname, unsigned bufsize = 8192)
|
||||
: std::iostream(&myStreambuf),
|
||||
myStreambuf(fname, bufsize)
|
||||
{ }
|
||||
|
||||
void seekg(offset_type off) { myStreambuf.seekg(off); }
|
||||
void setBufsize(unsigned s) { myStreambuf.setBufsize(s); }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif // ZIM_FSTREAM_H
|
||||
/*
|
||||
* Copyright (C) 2010 Tommi Maekitalo
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef ZIM_FSTREAM_H
|
||||
#define ZIM_FSTREAM_H
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <zim/zim.h>
|
||||
|
||||
namespace zim
|
||||
{
|
||||
class streambuf : public std::streambuf
|
||||
{
|
||||
std::vector<char> buffer;
|
||||
int fd;
|
||||
|
||||
std::streambuf::int_type overflow(std::streambuf::int_type ch);
|
||||
std::streambuf::int_type underflow();
|
||||
int sync();
|
||||
|
||||
public:
|
||||
typedef zim::offset_type offset_type;
|
||||
|
||||
streambuf(const char* fname, unsigned bufsize);
|
||||
~streambuf();
|
||||
|
||||
void seekg(offset_type off);
|
||||
void setBufsize(unsigned s)
|
||||
{ buffer.resize(s); }
|
||||
};
|
||||
|
||||
class ifstream : public std::iostream
|
||||
{
|
||||
streambuf myStreambuf;
|
||||
|
||||
public:
|
||||
typedef streambuf::offset_type offset_type;
|
||||
|
||||
explicit ifstream(const char* fname, unsigned bufsize = 8192)
|
||||
: std::iostream(&myStreambuf),
|
||||
myStreambuf(fname, bufsize)
|
||||
{ }
|
||||
|
||||
void seekg(offset_type off) { myStreambuf.seekg(off); }
|
||||
void setBufsize(unsigned s) { myStreambuf.setBufsize(s); }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif // ZIM_FSTREAM_H
|
@ -38,7 +38,7 @@ namespace zim
|
||||
};
|
||||
|
||||
typedef std::map<std::string, WordAttr> WordListType; // map word => count and addweight
|
||||
typedef std::map<uint32_t, std::string> PosListType; // map position => word
|
||||
typedef std::map<size_type, std::string> PosListType; // map position => word
|
||||
WordListType wordList;
|
||||
PosListType posList;
|
||||
|
||||
@ -50,7 +50,7 @@ namespace zim
|
||||
{ }
|
||||
const Article& getArticle() const { return article; }
|
||||
double getPriority() const;
|
||||
void foundWord(const std::string& word, uint32_t pos, unsigned addweight);
|
||||
void foundWord(const std::string& word, size_type pos, unsigned addweight);
|
||||
unsigned getCountWords() const { return wordList.size(); }
|
||||
unsigned getCountPositions() const { return posList.size(); }
|
||||
};
|
||||
@ -83,6 +83,9 @@ namespace zim
|
||||
File articlefile;
|
||||
|
||||
public:
|
||||
Search()
|
||||
{ }
|
||||
|
||||
explicit Search(const File& zimfile)
|
||||
: indexfile(zimfile),
|
||||
articlefile(zimfile)
|
||||
|
@ -20,18 +20,74 @@
|
||||
#ifndef ZIM_ZIM_H
|
||||
#define ZIM_ZIM_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef signed __int8 int8_t;
|
||||
typedef signed __int32 int32_t;
|
||||
#endif
|
||||
#include <limits.h>
|
||||
|
||||
namespace zim
|
||||
{
|
||||
typedef uint32_t size_type;
|
||||
typedef uint64_t offset_type;
|
||||
#if USHRT_MAX == 0xffff
|
||||
|
||||
typedef unsigned short uint16_t;
|
||||
|
||||
#elif UINT_MAX == 0xffff
|
||||
|
||||
typedef unsigned int uint16_t;
|
||||
|
||||
#elif ULONG_MAX == 0xffff
|
||||
|
||||
typedef unsigned long uint16_t;
|
||||
|
||||
#else
|
||||
|
||||
}
|
||||
#include <stdint.h>
|
||||
namespace zim
|
||||
{
|
||||
|
||||
#endif
|
||||
|
||||
#if USHRT_MAX == 0xffffffffUL
|
||||
|
||||
typedef unsigned short size_type;
|
||||
|
||||
#elif UINT_MAX == 0xffffffffUL
|
||||
|
||||
typedef unsigned int size_type;
|
||||
|
||||
#elif ULONG_MAX == 0xffffffffUL
|
||||
|
||||
typedef unsigned long size_type;
|
||||
|
||||
#else
|
||||
|
||||
}
|
||||
#include <stdint.h>
|
||||
namespace zim
|
||||
{
|
||||
typedef uint32_t size_type;
|
||||
|
||||
#endif
|
||||
|
||||
#if UINT_MAX == 18446744073709551615ULL
|
||||
|
||||
typedef unsigned int offset_type;
|
||||
|
||||
#elif ULONG_MAX == 18446744073709551615ULL
|
||||
|
||||
typedef unsigned long offset_type;
|
||||
|
||||
#elif ULLONG_MAX == 18446744073709551615ULL
|
||||
|
||||
typedef unsigned long long offset_type;
|
||||
|
||||
#else
|
||||
|
||||
}
|
||||
#include <stdint.h>
|
||||
namespace zim
|
||||
{
|
||||
typedef uint64_t offset_type;
|
||||
|
||||
#endif
|
||||
|
||||
enum CompressionType
|
||||
{
|
||||
|
@ -1,9 +1,9 @@
|
||||
/* src/zimlib/src/config.h.in. Generated from configure.ac by autoheader. */
|
||||
/* src/config.h.in. Generated from configure.in by autoheader. */
|
||||
|
||||
/* set zim cluster cache size to number of cached chunks */
|
||||
/* set cluster cache size to number of cached chunks */
|
||||
#undef CLUSTER_CACHE_SIZE
|
||||
|
||||
/* set zim dirent cache size to number of cached chunks */
|
||||
/* set dirent cache size to number of cached chunks */
|
||||
#undef DIRENT_CACHE_SIZE
|
||||
|
||||
/* defined if bzip2 compression is enabled */
|
||||
@ -21,21 +21,6 @@
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#undef HAVE_INTTYPES_H
|
||||
|
||||
/* Define to 1 if you have the `bz2' library (-lbz2). */
|
||||
#undef HAVE_LIBBZ2
|
||||
|
||||
/* Define to 1 if you have the `lzma' library (-llzma). */
|
||||
#undef HAVE_LIBLZMA
|
||||
|
||||
/* Define to 1 if you have the `microhttpd' library (-lmicrohttpd). */
|
||||
#undef HAVE_LIBMICROHTTPD
|
||||
|
||||
/* Define to 1 if you have the `unac' library (-lunac). */
|
||||
#undef HAVE_LIBUNAC
|
||||
|
||||
/* Define to 1 if you have the `z' library (-lz). */
|
||||
#undef HAVE_LIBZ
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#undef HAVE_MEMORY_H
|
||||
|
||||
@ -96,3 +81,6 @@
|
||||
|
||||
/* Version number of package */
|
||||
#undef VERSION
|
||||
|
||||
/* defined if cxxtools is used */
|
||||
#undef WITH_CXXTOOLS
|
||||
|
@ -23,7 +23,6 @@
|
||||
#include <zim/endian.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
#include <sstream>
|
||||
#include <errno.h>
|
||||
#include "config.h"
|
||||
@ -42,7 +41,7 @@ namespace zim
|
||||
// FileImpl
|
||||
//
|
||||
FileImpl::FileImpl(const char* fname)
|
||||
: zimFile(fname, std::ios::in | std::ios::binary),
|
||||
: zimFile(fname),
|
||||
direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)),
|
||||
clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE))
|
||||
{
|
||||
@ -111,6 +110,8 @@ namespace zim
|
||||
{
|
||||
log_trace("FileImpl::getDirent(" << idx << ')');
|
||||
|
||||
zimFile.setBufsize(64);
|
||||
|
||||
if (idx >= getCountArticles())
|
||||
throw ZimFileFormatError("article index out of range");
|
||||
|
||||
@ -192,6 +193,8 @@ namespace zim
|
||||
return cluster;
|
||||
}
|
||||
|
||||
zimFile.setBufsize(16384);
|
||||
|
||||
offset_type clusterOffset = getClusterOffset(idx);
|
||||
log_debug("read cluster " << idx << " from offset " << clusterOffset);
|
||||
zimFile.seekg(clusterOffset);
|
||||
|
97
src/zimlib/src/fstream.cpp
Normal file
97
src/zimlib/src/fstream.cpp
Normal file
@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright (C) 2010 Tommi Maekitalo
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
|
||||
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
|
||||
* NON-INFRINGEMENT. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
#include <zim/fstream.h>
|
||||
#include "log.h"
|
||||
#include "config.h"
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
log_define("zim.fstream")
|
||||
|
||||
namespace zim
|
||||
{
|
||||
std::streambuf::int_type streambuf::overflow(std::streambuf::int_type ch)
|
||||
{
|
||||
return traits_type::eof();
|
||||
}
|
||||
|
||||
std::streambuf::int_type streambuf::underflow()
|
||||
{
|
||||
log_debug("underflow; bufsize=" << buffer.size());
|
||||
|
||||
int n = ::read(fd, &buffer[0], buffer.size());
|
||||
if (n < 0)
|
||||
{
|
||||
std::ostringstream msg;
|
||||
msg << "error " << errno << " reading from file: " << strerror(errno);
|
||||
throw std::runtime_error(msg.str());
|
||||
}
|
||||
else if (n == 0)
|
||||
return traits_type::eof();
|
||||
|
||||
char* p = &buffer[0];
|
||||
setg(p, p, p + n);
|
||||
return traits_type::to_int_type(*gptr());
|
||||
}
|
||||
|
||||
int streambuf::sync()
|
||||
{
|
||||
return traits_type::eof();
|
||||
}
|
||||
|
||||
// Flags used to open the file: read-only, plus binary mode on platforms
// whose <fcntl.h> provides O_BINARY, so the file contents are not subject
// to text-mode translation.
static int readOnlyOpenFlags()
{
#ifdef O_BINARY
  return O_RDONLY | O_BINARY;
#else
  return O_RDONLY;
#endif
}

// Opens `fname` for reading and allocates a buffer of `bufsize` characters
// (at least one, so there is always room to read into).
// Throws std::runtime_error when the file cannot be opened.
streambuf::streambuf(const char* fname, unsigned bufsize)
  : buffer(bufsize > 0 ? bufsize : 1),
    fd(::open64(fname, readOnlyOpenFlags()))
{
  // Remember errno right away; the logging below may overwrite it.
  const int err = errno;

  log_debug("streambuf for " << fname << " with " << bufsize << " bytes");

  if (fd < 0)
  {
    std::ostringstream msg;
    msg << "error " << err << " opening file \"" << fname << "\": " << strerror(err);
    throw std::runtime_error(msg.str());
  }
}
|
||||
|
||||
// Closes the file descriptor held in `fd`.
streambuf::~streambuf()
{
  // The result of close() is not inspected.
  static_cast<void>(::close(fd));
}
|
||||
|
||||
// Moves the file position to the absolute offset `off` and discards any
// buffered characters, so the next read refills from the new position.
// Throws std::runtime_error when the seek fails or does not end up at the
// requested offset.
void streambuf::seekg(offset_type off)
{
  // Drop the get area first: its contents belong to the old position.
  setg(0, 0, 0);

#ifdef HAVE_LSEEK64
  off64_t ret = ::lseek64(fd, off, SEEK_SET);
#else
  off_t ret = ::lseek(fd, off, SEEK_SET);
#endif

  if (ret < 0)
  {
    const int err = errno;
    std::ostringstream msg;
    msg << "error " << err << " seeking to "<< off << " in file: " << strerror(err);
    throw std::runtime_error(msg.str());
  }

  // If the platform's offset type is narrower than offset_type, `off` is
  // truncated on the way into the call and the seek "succeeds" at a wrong
  // position.  Comparing the resulting position catches that case.
  if (static_cast<offset_type>(ret) != off)
  {
    std::ostringstream msg;
    msg << "error seeking to " << off << " in file: reached position " << ret << " instead";
    throw std::runtime_error(msg.str());
  }
}
|
||||
|
||||
}
|
@ -87,14 +87,14 @@ namespace zim
|
||||
// weight distance between different words
|
||||
PosListType::const_iterator itp = posList.begin();
|
||||
std::string word = itp->second;
|
||||
uint32_t pos = itp->first + word.size();
|
||||
size_type pos = itp->first + word.size();
|
||||
for (++itp; itp != posList.end(); ++itp)
|
||||
{
|
||||
if (word != itp->second)
|
||||
{
|
||||
uint32_t dist = itp->first > pos ? (itp->first - pos)
|
||||
: itp->first < pos ? (pos - itp->first)
|
||||
: 1;
|
||||
size_type dist = itp->first > pos ? (itp->first - pos)
|
||||
: itp->first < pos ? (pos - itp->first)
|
||||
: 1;
|
||||
priority += Search::getWeightDist() / dist;
|
||||
}
|
||||
word = itp->second;
|
||||
@ -118,7 +118,7 @@ namespace zim
|
||||
return priority;
|
||||
}
|
||||
|
||||
void SearchResult::foundWord(const std::string& word, uint32_t pos, unsigned addweight)
|
||||
void SearchResult::foundWord(const std::string& word, size_type pos, unsigned addweight)
|
||||
{
|
||||
++wordList[word].count;
|
||||
wordList[word].addweight += addweight;
|
||||
@ -142,7 +142,7 @@ namespace zim
|
||||
std::string token;
|
||||
|
||||
// map from article index to article + relevance information
|
||||
typedef std::map<uint32_t, SearchResult> IndexType;
|
||||
typedef std::map<size_type, SearchResult> IndexType;
|
||||
IndexType index;
|
||||
|
||||
while (ssearch >> token)
|
||||
@ -174,8 +174,8 @@ namespace zim
|
||||
const IndexArticle::EntriesType ent = indexarticle.getCategory(cat);
|
||||
for (IndexArticle::EntriesType::const_iterator it = ent.begin(); it != ent.end(); ++it)
|
||||
{
|
||||
uint32_t articleIdx = it->index;
|
||||
uint32_t position = it->pos;
|
||||
size_type articleIdx = it->index;
|
||||
size_type position = it->pos;
|
||||
|
||||
IndexType::iterator itIt = index.insert(
|
||||
IndexType::value_type(articleIdx,
|
||||
@ -192,7 +192,7 @@ namespace zim
|
||||
find(results, 'A', token);
|
||||
for (Results::const_iterator it = results.begin(); it != results.end(); ++it)
|
||||
{
|
||||
uint32_t articleIdx = it->getArticle().getIndex();
|
||||
size_type articleIdx = it->getArticle().getIndex();
|
||||
|
||||
IndexType::iterator itIt = index.insert(
|
||||
IndexType::value_type(articleIdx,
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
#include <zim/uuid.h>
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <zim/zim.h> // necessary to have the new types
|
||||
#include "log.h"
|
||||
@ -26,7 +27,7 @@
|
||||
#include <cxxtools/md5stream.h>
|
||||
#endif
|
||||
|
||||
#ifdef WIN32
|
||||
#ifdef _WIN32
|
||||
|
||||
# include <time.h>
|
||||
# include <windows.h>
|
||||
|
Loading…
x
Reference in New Issue
Block a user