+ latest version of zimlib (adds support for split ZIM files)

This commit is contained in:
kelson42 2010-08-22 18:16:35 +00:00
parent 52a1eb82a4
commit 8be1d3a2d5
4 changed files with 258 additions and 76 deletions

View File

@ -23,43 +23,75 @@
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <zim/zim.h> #include <zim/zim.h>
#include <zim/smartptr.h>
#include <zim/cache.h>
#include <zim/refcounted.h>
namespace zim namespace zim
{ {
class streambuf : public std::streambuf class streambuf : public std::streambuf
{ {
struct FileInfo : public RefCounted
{
std::string fname;
zim::offset_type fsize;
FileInfo() { }
FileInfo(const std::string& fname_, int fd);
};
struct OpenfileInfo : public RefCounted
{
std::string fname;
int fd;
explicit OpenfileInfo(const std::string& fname);
~OpenfileInfo();
};
typedef SmartPtr<FileInfo> FileInfoPtr;
typedef std::vector<FileInfoPtr> FilesType;
typedef SmartPtr<OpenfileInfo> OpenfileInfoPtr;
typedef Cache<std::string, OpenfileInfoPtr> OpenFilesCacheType;
std::vector<char> buffer; std::vector<char> buffer;
int fd;
FilesType files;
OpenFilesCacheType openFilesCache;
OpenfileInfoPtr currentFile;
offset_type currentPos;
std::streambuf::int_type overflow(std::streambuf::int_type ch); std::streambuf::int_type overflow(std::streambuf::int_type ch);
std::streambuf::int_type underflow(); std::streambuf::int_type underflow();
int sync(); int sync();
void setCurrentFile(const std::string& fname, off_t off);
public: public:
typedef zim::offset_type offset_type; streambuf(const std::string& fname, unsigned bufsize, unsigned openFilesCache);
streambuf(const char* fname, unsigned bufsize); void seekg(zim::offset_type off);
~streambuf();
void seekg(offset_type off);
void setBufsize(unsigned s) void setBufsize(unsigned s)
{ buffer.resize(s); } { buffer.resize(s); }
zim::offset_type fsize() const;
}; };
class ifstream : public std::iostream class ifstream : public std::istream
{ {
streambuf myStreambuf; streambuf myStreambuf;
public: public:
typedef streambuf::offset_type offset_type; explicit ifstream(const std::string& fname, unsigned bufsize = 8192, unsigned openFilesCache = 5)
: std::istream(0),
myStreambuf(fname, bufsize, openFilesCache)
{
init(&myStreambuf);
}
explicit ifstream(const char* fname, unsigned bufsize = 8192) void seekg(zim::offset_type off) { myStreambuf.seekg(off); }
: std::iostream(&myStreambuf),
myStreambuf(fname, bufsize)
{ }
void seekg(offset_type off) { myStreambuf.seekg(off); }
void setBufsize(unsigned s) { myStreambuf.setBufsize(s); } void setBufsize(unsigned s) { myStreambuf.setBufsize(s); }
zim::offset_type fsize() const { return myStreambuf.fsize(); }
}; };
} }

View File

@ -24,6 +24,9 @@
/* Define to 1 if you have the `bz2' library (-lbz2). */ /* Define to 1 if you have the `bz2' library (-lbz2). */
#undef HAVE_LIBBZ2 #undef HAVE_LIBBZ2
/* Define to 1 if you have the `clucene' library (-lclucene). */
#undef HAVE_LIBCLUCENE
/* Define to 1 if you have the `lzma' library (-llzma). */ /* Define to 1 if you have the `lzma' library (-llzma). */
#undef HAVE_LIBLZMA #undef HAVE_LIBLZMA

View File

@ -50,28 +50,6 @@ namespace zim
if (!zimFile) if (!zimFile)
throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"'); throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"');
#ifdef HAVE_STAT64
struct stat64 st;
int ret = ::stat64(fname, &st);
#elif _WIN32
struct __stat64 st;
int ret = ::_stat64(fname, &st);
#else
struct stat st;
int ret = ::stat(fname, &st);
#endif
if (ret != 0)
#ifdef WITH_CXXTOOLS
throw cxxtools::SystemError("stat");
#else
{
std::ostringstream msg;
msg << "stat failed with errno " << errno << " : " << strerror(errno);
throw std::runtime_error(msg.str());
}
#endif
mtime = st.st_mtime;
filename = fname; filename = fname;
// read header // read header
@ -84,10 +62,10 @@ namespace zim
else else
{ {
offset_type lastOffset = getClusterOffset(getCountClusters() - 1); offset_type lastOffset = getClusterOffset(getCountClusters() - 1);
log_debug("last offset=" << lastOffset << " file size=" << st.st_size); log_debug("last offset=" << lastOffset << " file size=" << zimFile.fsize());
if (lastOffset > static_cast<offset_type>(st.st_size)) if (lastOffset > static_cast<offset_type>(zimFile.fsize()))
{ {
log_fatal("last offset (" << lastOffset << ") larger than file size (" << st.st_size << ')'); log_fatal("last offset (" << lastOffset << ") larger than file size (" << zimFile.fsize() << ')');
throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); throw ZimFileFormatError("last cluster offset larger than file size; file corrupt");
} }
} }

View File

@ -25,19 +25,71 @@
#include <errno.h> #include <errno.h>
#include <string.h> #include <string.h>
#include <fcntl.h> #include <fcntl.h>
#include <unistd.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#ifdef _WIN32 #ifndef O_LARGEFILE
#include <io.h> #define O_LARGEFILE 0
int _fmode = _O_BINARY; #endif
#define _LARGEFILE64_SOURCE
#ifndef O_BINARY
#define O_BINARY 0
#endif #endif
log_define("zim.fstream") log_define("zim.fstream")
namespace zim namespace zim
{ {
// Exception type used to report that a file could not be opened.
// It carries no details: what() always yields "file not found".
class FileNotFound : public std::runtime_error
{
  public:
    FileNotFound() : std::runtime_error("file not found") { }
};
////////////////////////////////////////////////////////////
// OpenfileInfo
//
// Opens the file named fname_ for reading and keeps the resulting
// descriptor in fd.
//
// open64 is used when HAVE_OPEN64 is defined, plain open otherwise.
// Throws FileNotFound whenever the open call returns a negative descriptor.
streambuf::OpenfileInfo::OpenfileInfo(const std::string& fname_)
  : fname(fname_),
    fd(-1)
{
  const int openFlags = O_RDONLY | O_LARGEFILE | O_BINARY;

#ifdef HAVE_OPEN64
  fd = ::open64(fname.c_str(), openFlags);
#else
  fd = ::open(fname.c_str(), openFlags);
#endif

  if (fd < 0)
    throw FileNotFound();
}
// Closes the descriptor held in fd. The constructor throws when the open
// call fails, so a fully constructed object always holds a non-negative fd.
streambuf::OpenfileInfo::~OpenfileInfo()
{
// the result of close is not checked
::close(fd);
}
////////////////////////////////////////////////////////////
// FileInfo
//
// Records the name and the size of one file.
//
// fname_  name stored in the fname member and quoted in the error message
// fd      descriptor of that file; the size is obtained by seeking to its
//         end, so the descriptor's offset is at end-of-file afterwards
//
// Throws std::runtime_error when the seek fails.
streambuf::FileInfo::FileInfo(const std::string& fname_, int fd)
  : fname(fname_)
{
#ifdef HAVE_LSEEK64
  off64_t ret = ::lseek64(fd, 0, SEEK_END);
#else
  off_t ret = ::lseek(fd, 0, SEEK_END);
#endif
  if (ret < 0)
  {
    // Capture errno right away: building the message below involves library
    // calls that are allowed to overwrite it, which could make the reported
    // number and the strerror text disagree.
    const int errnoSave = errno;

    std::ostringstream msg;
    msg << "error " << errnoSave << " seeking to end in file " << fname << ": " << strerror(errnoSave);
    throw std::runtime_error(msg.str());
  }

  fsize = static_cast<offset_type>(ret);
}
std::streambuf::int_type streambuf::overflow(std::streambuf::int_type ch) std::streambuf::int_type streambuf::overflow(std::streambuf::int_type ch)
{ {
return traits_type::eof(); return traits_type::eof();
@ -47,15 +99,34 @@ std::streambuf::int_type streambuf::underflow()
{ {
log_debug("underflow; bufsize=" << buffer.size()); log_debug("underflow; bufsize=" << buffer.size());
int n = ::read(fd, &buffer[0], buffer.size()); int n;
if (n < 0) do
{ {
std::ostringstream msg; n = ::read(currentFile->fd, &buffer[0], buffer.size());
msg << "error " << errno << " reading from file: " << strerror(errno); if (n < 0)
throw std::runtime_error(msg.str()); {
} std::ostringstream msg;
else if (n == 0) msg << "error " << errno << " reading from file: " << strerror(errno);
return traits_type::eof(); throw std::runtime_error(msg.str());
}
else if (n == 0)
{
FilesType::iterator it;
for (it = files.begin(); it != files.end(); ++it)
{
if ((*it)->fname == currentFile->fname)
{
++it;
break;
}
}
if (it == files.end())
return traits_type::eof();
setCurrentFile((*it)->fname, 0);
}
} while (n == 0);
char* p = &buffer[0]; char* p = &buffer[0];
setg(p, p, p + n); setg(p, p, p + n);
@ -67,46 +138,144 @@ int streambuf::sync()
return traits_type::eof(); return traits_type::eof();
} }
streambuf::streambuf(const char* fname, unsigned bufsize) namespace
{
// Splits a ':'-separated list of file names and appends the names to out.
//
// A backslash takes the following character literally, so "\:" yields a
// colon inside a name and "\\" yields a backslash. A backslash at the very
// end of the list has nothing to escape and is dropped.
//
// list  the list to split; an empty list appends nothing
// out   receives one entry per name, in order; existing entries are kept.
//       Empty names (leading, trailing or doubled ':') are kept as empty
//       strings.
//
// The separator itself never becomes part of a name.
void parseFilelist(const std::string& list, std::vector<std::string>& out)
{
  if (list.empty())
    return;

  // The first name starts implicitly at the beginning of the list.
  out.push_back(std::string());

  bool escaped = false;
  for (std::string::const_iterator it = list.begin(); it != list.end(); ++it)
  {
    if (escaped)
    {
      // character after a backslash: always literal
      out.back() += *it;
      escaped = false;
    }
    else if (*it == '\\')
    {
      escaped = true;
    }
    else if (*it == ':')
    {
      // separator: start the next name without copying the ':' into it
      out.push_back(std::string());
    }
    else
    {
      out.back() += *it;
    }
  }
}
}
streambuf::streambuf(const std::string& fname, unsigned bufsize, unsigned noOpenFiles)
: buffer(bufsize), : buffer(bufsize),
#ifdef HAVE_OPEN64 openFilesCache(noOpenFiles)
fd(::open64(fname, 0))
#else
fd(::open(fname, 0))
#endif
{ {
log_debug("streambuf for " << fname << " with " << bufsize << " bytes"); log_debug("streambuf for " << fname << " with " << bufsize << " bytes");
if (fd < 0) try
{ {
std::ostringstream msg; currentFile = new OpenfileInfo(fname);
msg << "error " << errno << " opening file \"" << fname << "\": " << strerror(errno); files.push_back(new FileInfo(fname, currentFile->fd));
throw std::runtime_error(msg.str()); openFilesCache.put(fname, currentFile);
}
catch (const FileNotFound&)
{
int errnoSave = errno;
try
{
for (char ch0 = 'a'; ch0 <= 'z'; ++ch0)
{
std::string fname0 = fname + ch0;
for (char ch1 = 'a'; ch1 <= 'z'; ++ch1)
{
std::string fname1 = fname0 + ch1;
currentFile = new OpenfileInfo(fname1);
files.push_back(new FileInfo(fname1, currentFile->fd));
openFilesCache.put(fname1, currentFile);
}
}
}
catch (const FileNotFound&)
{
if (files.empty())
{
std::ostringstream msg;
msg << "error " << errnoSave << " opening file \"" << fname << "\": " << strerror(errnoSave);
throw std::runtime_error(msg.str());
}
}
}
setCurrentFile((*files.begin())->fname, 0);
}
// Makes the file named fname the current read source (currentFile) and
// positions its descriptor at offset off.
//
// The OpenfileInfo is taken from openFilesCache when it is there; otherwise
// the file is opened and the new entry is put into the cache.
//
// Throws std::runtime_error when the seek fails. Opening goes through the
// OpenfileInfo constructor, which throws FileNotFound when the open fails.
//
// NOTE(review): off is an off_t while seekg passes a zim::offset_type;
// whether that can narrow depends on the platform's off_t - confirm.
void streambuf::setCurrentFile(const std::string& fname, off_t off)
{
std::pair<bool, OpenfileInfoPtr> f = openFilesCache.getx(fname);
if (f.first)
{
currentFile = f.second;
}
else
{
// file not found in cache
currentFile = new OpenfileInfo(fname);
openFilesCache.put(fname, currentFile);
}
// A cached descriptor is always repositioned, even to offset 0, presumably
// because earlier reads may have moved it. A freshly opened one is only
// repositioned when a non-zero offset is requested.
if (f.first || off != 0) // found in cache or seek requested
{
#ifdef HAVE_LSEEK64
off64_t ret = ::lseek64(currentFile->fd, off, SEEK_SET);
#else
off_t ret = ::lseek(currentFile->fd, off, SEEK_SET);
#endif
if (ret < 0)
{
std::ostringstream msg;
msg << "error " << errno << " seeking to "<< off << " in file " << fname << ": " << strerror(errno);
throw std::runtime_error(msg.str());
}
} }
} }
streambuf::~streambuf() void streambuf::seekg(zim::offset_type off)
{
::close(fd);
}
void streambuf::seekg(offset_type off)
{ {
setg(0, 0, 0); setg(0, 0, 0);
#ifdef HAVE_LSEEK64 currentPos = off;
off64_t ret = ::lseek64(fd, off, SEEK_SET);
#elif _WIN32
offset_type ret = ::_lseeki64(fd, off, SEEK_SET);
#else
off_t ret = ::lseek(fd, off, SEEK_SET);
#endif
if (ret < 0) zim::offset_type o = off;
FilesType::iterator it;
for (it = files.begin(); it != files.end() && (*it)->fsize < o; ++it)
o -= (*it)->fsize;
if (it == files.end())
{ {
std::ostringstream msg; std::ostringstream msg;
msg << "error " << errno << " seeking to "<< off << " in file: " << strerror(errno); msg << "error seeking to "<< off;
throw std::runtime_error(msg.str()); throw std::runtime_error(msg.str());
} }
setCurrentFile((*it)->fname, o);
}
// Returns the sum of the fsize values of all entries in files, i.e. the
// combined size of every file this streambuf was set up with. The sizes are
// the ones stored in the FileInfo entries; the files are not examined again.
zim::offset_type streambuf::fsize() const
{
zim::offset_type o = 0;
for (FilesType::const_iterator it = files.begin(); it != files.end(); ++it)
o += (*it)->fsize;
return o;
} }
} }