diff --git a/src/common/kiwix/library.h b/src/common/kiwix/library.h index dd1e28fd..4ae5b3a8 100644 --- a/src/common/kiwix/library.h +++ b/src/common/kiwix/library.h @@ -67,6 +67,7 @@ namespace kiwix { string publisher; string date; string url; + string origId; string articleCount; string mediaCount; bool readOnly; diff --git a/src/common/kiwix/manager.cpp b/src/common/kiwix/manager.cpp index a4925d83..bf2726d4 100644 --- a/src/common/kiwix/manager.cpp +++ b/src/common/kiwix/manager.cpp @@ -56,6 +56,7 @@ namespace kiwix { book.creator = bookNode.attribute("creator").value(); book.publisher = bookNode.attribute("publisher").value(); book.url = bookNode.attribute("url").value(); + book.origId = bookNode.attribute("origId").value(); book.articleCount = bookNode.attribute("articleCount").value(); book.mediaCount = bookNode.attribute("mediaCount").value(); book.size = bookNode.attribute("size").value(); @@ -154,41 +155,46 @@ namespace kiwix { bookNode.append_attribute("indexType") = "xapian"; } - if (!itr->title.empty()) - bookNode.append_attribute("title") = itr->title.c_str(); + if (itr->origId.empty()) { + if (!itr->title.empty()) + bookNode.append_attribute("title") = itr->title.c_str(); + + if (!itr->description.empty()) + bookNode.append_attribute("description") = itr->description.c_str(); + + if (!itr->language.empty()) + bookNode.append_attribute("language") = itr->language.c_str(); + + if (!itr->creator.empty()) + bookNode.append_attribute("creator") = itr->creator.c_str(); + + if (!itr->publisher.empty()) + bookNode.append_attribute("publisher") = itr->publisher.c_str(); + + if (!itr->favicon.empty()) + bookNode.append_attribute("favicon") = itr->favicon.c_str(); + + if (!itr->faviconMimeType.empty()) + bookNode.append_attribute("faviconMimeType") = itr->faviconMimeType.c_str(); + } - if (itr->description != "") - bookNode.append_attribute("description") = itr->description.c_str(); - - if (itr->language != "") - bookNode.append_attribute("language") = itr->language.c_str(); - - if (itr->date != "") + if (!itr->date.empty()) bookNode.append_attribute("date") = itr->date.c_str(); - if (itr->creator != "") - bookNode.append_attribute("creator") = itr->creator.c_str(); - - if (itr->publisher != "") - bookNode.append_attribute("publisher") = itr->publisher.c_str(); - - if (itr->url != "") + if (!itr->url.empty()) bookNode.append_attribute("url") = itr->url.c_str(); - - if (itr->articleCount != "") + + if (!itr->origId.empty()) + bookNode.append_attribute("origId") = itr->origId.c_str(); + + if (!itr->articleCount.empty()) bookNode.append_attribute("articleCount") = itr->articleCount.c_str(); - if (itr->mediaCount != "") + if (!itr->mediaCount.empty()) bookNode.append_attribute("mediaCount") = itr->mediaCount.c_str(); - if (itr->size != "") + if (!itr->size.empty()) bookNode.append_attribute("size") = itr->size.c_str(); - - if (itr->favicon != "") - bookNode.append_attribute("favicon") = itr->favicon.c_str(); - - if (itr->faviconMimeType != "") - bookNode.append_attribute("faviconMimeType") = itr->faviconMimeType.c_str(); } } @@ -256,7 +262,7 @@ namespace kiwix { book->creator = reader->getCreator(); book->publisher = reader->getPublisher(); book->title = reader->getTitle(); - + book->origId = reader->getOrigId(); std::ostringstream articleCountStream; articleCountStream << reader->getArticleCount(); book->articleCount = articleCountStream.str(); @@ -307,10 +313,12 @@ namespace kiwix { std::map booksLanguagesMap; std::sort(library.books.begin(), library.books.end(), kiwix::Book::sortByLanguage); - for ( itr = library.books.begin(); itr != library.books.end(); ++itr ) { + for (itr = library.books.begin(); itr != library.books.end(); ++itr) { if (booksLanguagesMap.find(itr->language) == booksLanguagesMap.end()) { - booksLanguagesMap[itr->language] = true; - booksLanguages.push_back(itr->language); + if (itr->origId.empty()) { + booksLanguagesMap[itr->language] = true; + booksLanguages.push_back(itr->language); + } } } @@ -323,10 +331,12 @@ namespace kiwix { std::map booksCreatorsMap; std::sort(library.books.begin(), library.books.end(), kiwix::Book::sortByCreator); - for ( itr = library.books.begin(); itr != library.books.end(); ++itr ) { + for (itr = library.books.begin(); itr != library.books.end(); ++itr) { if (booksCreatorsMap.find(itr->creator) == booksCreatorsMap.end()) { - booksCreatorsMap[itr->creator] = true; - booksCreators.push_back(itr->creator); + if (itr->origId.empty()) { + booksCreatorsMap[itr->creator] = true; + booksCreators.push_back(itr->creator); + } } } @@ -353,8 +363,10 @@ namespace kiwix { std::sort(library.books.begin(), library.books.end(), kiwix::Book::sortByPublisher); for ( itr = library.books.begin(); itr != library.books.end(); ++itr ) { if (booksPublishersMap.find(itr->publisher) == booksPublishersMap.end()) { - booksPublishersMap[itr->publisher] = true; - booksPublishers.push_back(itr->publisher); + if (itr->origId.empty()) { + booksPublishersMap[itr->publisher] = true; + booksPublishers.push_back(itr->publisher); + } } } diff --git a/src/common/kiwix/manager.h b/src/common/kiwix/manager.h index 6253eece..6190a1c6 100644 --- a/src/common/kiwix/manager.h +++ b/src/common/kiwix/manager.h @@ -40,7 +40,7 @@ namespace kiwix { enum supportedListSortBy { TITLE, SIZE, DATE, CREATOR, PUBLISHER }; class Manager { - + public: Manager(); ~Manager(); @@ -55,9 +55,9 @@ namespace kiwix { string getCurrentBookId(); bool setBookIndex(const string id, const string path, const supportedIndexType type); bool setBookPath(const string id, const string path); - string addBookFromPathAndGetId(const string pathToOpen, const string pathToSave = "", const string url = "", + string addBookFromPathAndGetId(const string pathToOpen, const string pathToSave = "", const string url = "", const bool checkMetaData = false); - bool addBookFromPath(const string pathToOpen, const string pathToSave = "", const string url = "", + bool addBookFromPath(const string pathToOpen, const string pathToSave = "", const string url = "", const bool checkMetaData = false); Library cloneLibrary(); bool getBookById(const string id, Book &book); @@ -65,7 +65,7 @@ namespace kiwix { unsigned int getBookCount(const bool localBooks, const bool remoteBooks); bool updateBookLastOpenDateById(const string id); void removeBookPaths(); - bool listBooks(const supportedListMode mode, const supportedListSortBy sortBy, const unsigned int maxSize, + bool listBooks(const supportedListMode mode, const supportedListSortBy sortBy, const unsigned int maxSize, const string language, const string creator, const string publisher, const string search); vector getBooksLanguages(); vector getBooksCreators(); @@ -75,10 +75,10 @@ namespace kiwix { string writableLibraryPath; vector bookIdList; - + protected: kiwix::Library library; - + bool readBookFromPath(const string path, Book *book = NULL); bool parseXmlDom(const pugi::xml_document &doc, const bool readOnly, const string libraryPath); diff --git a/src/common/kiwix/reader.cpp b/src/common/kiwix/reader.cpp index 2a928c08..8a6ece16 100644 --- a/src/common/kiwix/reader.cpp +++ b/src/common/kiwix/reader.cpp @@ -19,6 +19,38 @@ #include "reader.h" +inline char hi(char v) { + char hex[] = "0123456789abcdef"; + return hex[(v >> 4) & 0xf]; +} + +inline char lo(char v) { + char hex[] = "0123456789abcdef"; + return hex[v & 0xf]; +} + +std::string hexUUID (std::string in) { + std::ostringstream out; + for (unsigned n = 0; n < 4; ++n) + out << hi(in[n]) << lo(in[n]); + out << '-'; + for (unsigned n = 4; n < 6; ++n) + out << hi(in[n]) << lo(in[n]); + out << '-'; + for (unsigned n = 6; n < 8; ++n) + out << hi(in[n]) << lo(in[n]); + out << '-'; + for (unsigned n = 8; n < 10; ++n) + out << hi(in[n]) << lo(in[n]); + out << '-'; + for (unsigned n = 10; n < 16; ++n) + out << hi(in[n]) << lo(in[n]); + std::string op=out.str(); + return op; +} + + + static char charFromHex(std::string a) { std::istringstream Blat (a); int Z; @@ -28,9 +60,10 @@ static char charFromHex(std::string a) { void unescapeUrl(string &url) { std::string::size_type pos = 0; - while ((pos = url.find('%', pos + 1)) != std::string::npos && - pos + 3 <= url.length()) { + while ((pos = url.find('%', pos)) != std::string::npos && + pos + 2 < url.length()) { url.replace(pos, 3, 1, charFromHex(url.substr(pos + 1, 2))); + ++pos; } return; } @@ -38,14 +71,14 @@ void unescapeUrl(string &url) { namespace kiwix { /* Constructor */ - Reader::Reader(const string zimFilePath) + Reader::Reader(const string zimFilePath) : zimFileHandler(NULL) { string tmpZimFilePath = zimFilePath; /* Remove potential trailing zimaa */ size_t found = tmpZimFilePath.rfind("zimaa"); - if (found != string::npos && - tmpZimFilePath.size() > 5 && + if (found != string::npos && + tmpZimFilePath.size() > 5 && found == tmpZimFilePath.size() - 5) { tmpZimFilePath.resize(tmpZimFilePath.size() - 2); } @@ -63,7 +96,7 @@ namespace kiwix { /* initialize random seed: */ srand ( time(NULL) ); } - + /* Destructor */ Reader::~Reader() { if (this->zimFileHandler != NULL) { @@ -74,7 +107,7 @@ namespace kiwix { zim::File* Reader::getZimFileHandler() { return this->zimFileHandler; } - + /* Reset the cursor for GetNextArticle() */ void Reader::reset() { this->currentArticleOffset = this->firstArticleOffset; @@ -101,12 +134,12 @@ namespace kiwix { return counters; } - + /* Get the count of articles which can be indexed/displayed */ unsigned int Reader::getArticleCount() { std::map counterMap = this->parseCounterMetadata(); unsigned int counter = 0; - + if (counterMap.empty()) { counter = this->nsACount; } else { @@ -114,7 +147,7 @@ namespace kiwix { if (it != counterMap.end()) counter = it->second; } - + return counter; } @@ -140,10 +173,10 @@ namespace kiwix { if (it != counterMap.end()) counter += it->second; } - + return counter; } - + /* Get the total of all items of a ZIM file, redirects included */ unsigned int Reader::getGlobalCount() { return this->zimFileHandler->getCountArticles(); @@ -155,7 +188,7 @@ namespace kiwix { s << this->zimFileHandler->getFileheader().getUuid(); return s.str(); } - + /* Return a page url from a title */ bool Reader::getPageUrlFromTitle(const string &title, string &url) { /* Extract the content from the zim file */ @@ -163,7 +196,7 @@ namespace kiwix { /* Test if the article was found */ if (resultPair.first == true) { - + /* Get the article */ zim::Article article = *resultPair.second; @@ -172,7 +205,7 @@ namespace kiwix { while (article.isRedirect() && loopCounter++<42) { article = article.getRedirectArticle(); } - + url = article.getLongUrl(); return true; } @@ -182,53 +215,53 @@ namespace kiwix { /* Return an URL from a title*/ string Reader::getRandomPageUrl() { - zim::size_type idx = this->firstArticleOffset + - (zim::size_type)((double)rand() / ((double)RAND_MAX + 1) * this->nsACount); + zim::size_type idx = this->firstArticleOffset + + (zim::size_type)((double)rand() / ((double)RAND_MAX + 1) * this->nsACount); zim::Article article = zimFileHandler->getArticle(idx); return article.getLongUrl().c_str(); } - + /* Return the welcome page URL */ string Reader::getMainPageUrl() { string url = ""; - + if (this->zimFileHandler->getFileheader().hasMainPage()) { zim::Article article = zimFileHandler->getArticle(this->zimFileHandler->getFileheader().getMainPage()); url = article.getLongUrl(); if (url.empty()) { - url = getFirstPageUrl(); + url = getFirstPageUrl(); } } else { - url = getFirstPageUrl(); + url = getFirstPageUrl(); } - + return url; } - + bool Reader::getFavicon(string &content, string &mimeType) { unsigned int contentLength = 0; - - this->getContentByUrl( "/-/favicon.png", content, + + this->getContentByUrl( "/-/favicon.png", content, contentLength, mimeType); - + if (content.empty()) { - this->getContentByUrl( "/I/favicon.png", content, + this->getContentByUrl( "/I/favicon.png", content, contentLength, mimeType); if (content.empty()) { - this->getContentByUrl( "/I/favicon", content, + this->getContentByUrl( "/I/favicon", content, contentLength, mimeType); - + if (content.empty()) { - this->getContentByUrl( "/-/favicon", content, + this->getContentByUrl( "/-/favicon", content, contentLength, mimeType); } } } - + return content.empty() ? false : true; } @@ -236,11 +269,11 @@ namespace kiwix { bool Reader::getMetatag(const string &name, string &value) { unsigned int contentLength = 0; string contentType = ""; - - return this->getContentByUrl( "/M/" + name, value, + + return this->getContentByUrl( "/M/" + name, value, contentLength, contentType); } - + string Reader::getTitle() { string value; this->getMetatag("Title", value); @@ -256,7 +289,7 @@ namespace kiwix { string Reader::getDescription() { string value; this->getMetatag("Description", value); - + /* Mediawiki Collection tends to use the "Subtitle" name */ if (value.empty()) { this->getMetatag("Subtitle", value); @@ -289,34 +322,61 @@ namespace kiwix { return value; } + string Reader::getOrigId() { + string value; + this->getMetatag("startfileuid", value); + if(value.empty()) + return ""; + std::string id=value; + std::string origID; + std::string temp=""; + unsigned int k=0; + char tempArray[16]=""; + for(unsigned int i=0; igetNamespaceBeginOffset('A'); zim::Article article = zimFileHandler->getArticle(firstPageOffset); url = article.getLongUrl(); - + return url; } - + bool Reader::parseUrl(const string &url, char *ns, string &title) { /* Offset to visit the url */ unsigned int urlLength = url.size(); unsigned int offset = 0; - + /* Ignore the '/' */ while ((offset < urlLength) && (url[offset] == '/')) offset++; - + /* Get namespace */ while ((offset < urlLength) && (url[offset] != '/')) { *ns= url[offset]; offset++; } - + /* Ignore the '/' */ - while ((offset < urlLength) && (url[offset] == '/')) offset++; - + while ((offset < urlLength) && (url[offset] == '/')) offset++; + /* Get content title */ unsigned int titleOffset = offset; while (offset < urlLength) { @@ -338,7 +398,7 @@ namespace kiwix { contentLength = 0; if (this->zimFileHandler != NULL) { - + /* Parse the url */ char ns = 0; string titleStr; @@ -348,68 +408,72 @@ namespace kiwix { if (titleStr.empty() && ns == 0) { this->parseUrl(this->getMainPageUrl(), &ns, titleStr); } - + /* Extract the content from the zim file */ std::pair resultPair = zimFileHandler->findx(ns, titleStr); - + /* Test if the article was found */ if (resultPair.first == true) { - + /* Get the article */ zim::Article article = zimFileHandler->getArticle(resultPair.second.getIndex()); - + /* If redirect */ unsigned int loopCounter = 0; while (article.isRedirect() && loopCounter++<42) { article = article.getRedirectArticle(); } - + /* Get the content mime-type */ - contentType = string(article.getMimeType().data(), article.getMimeType().size()); - + contentType = string(article.getMimeType().data(), article.getMimeType().size()); + /* Get the data */ content = string(article.getData().data(), article.getArticleSize()); - + /* Try to set a stub HTML header/footer if necesssary */ if (contentType == "text/html" && std::string::npos == content.find("")) { content = "" + article.getTitle() + "" + content + ""; } - + /* Get the data length */ contentLength = article.getArticleSize(); - + /* Set return value */ retVal = true; } } - + return retVal; } - + /* Search titles by prefix */ bool Reader::searchSuggestions(const string &prefix, unsigned int suggestionsCount, const bool reset) { bool retVal = false; - zim::File::const_iterator articleItr; + zim::File::const_iterator articleItr; std::vector::iterator suggestionItr; int result; - /* Reset the suggestions */ + /* Reset the suggestions otherwise check if the suggestions number is less than the suggestionsCount */ if (reset) { this->suggestions.clear(); + } else { + if (this->suggestions.size() > suggestionsCount) { + return false; + } } if (prefix.size()) { for (articleItr = zimFileHandler->findByTitle('A', prefix); - articleItr != zimFileHandler->end() && - articleItr->getTitle().compare(0, prefix.size(), prefix) == 0 && - this->suggestions.size() < suggestionsCount ; + articleItr != zimFileHandler->end() && + articleItr->getTitle().compare(0, prefix.size(), prefix) == 0 && + this->suggestions.size() < suggestionsCount ; ++articleItr) { if (this->suggestions.size() == 0) { this->suggestions.push_back(articleItr->getTitle()); - } else { - for (suggestionItr = this->suggestions.begin() ; - suggestionItr != this->suggestions.end(); + } else if (this->suggestions.size() < suggestionsCount) { + for (suggestionItr = this->suggestions.begin() ; + suggestionItr != this->suggestions.end(); ++suggestionItr) { result = articleItr->getTitle().compare(*suggestionItr); @@ -425,25 +489,25 @@ namespace kiwix { this->suggestions.push_back(articleItr->getTitle()); } } - + /* Suggestions where found */ retVal = true; } } - + /* Set the cursor to the begining */ this->suggestionsOffset = this->suggestions.begin(); - + return retVal; } - + /* Try also a few variations of the prefix to have better results */ bool Reader::searchSuggestionsSmart(const string &prefix, unsigned int suggestionsCount) { std::string myPrefix = prefix; /* Normal suggestion request */ bool retVal = this->searchSuggestions(prefix, suggestionsCount, true); - + /* Try with first letter uppercase */ myPrefix = kiwix::ucFirst(myPrefix); this->searchSuggestions(myPrefix, suggestionsCount, false); @@ -452,6 +516,10 @@ namespace kiwix { myPrefix = kiwix::lcFirst(myPrefix); this->searchSuggestions(myPrefix, suggestionsCount, false); + /* Try with title words */ + myPrefix = kiwix::toTitle(myPrefix); + this->searchSuggestions(myPrefix, suggestionsCount, false); + return retVal; } @@ -460,10 +528,10 @@ namespace kiwix { if (this->suggestionsOffset != this->suggestions.end()) { /* title */ title = *(this->suggestionsOffset); - + /* increment the cursor for the next call */ this->suggestionsOffset++; - + return true; } @@ -492,7 +560,7 @@ namespace kiwix { unsigned int Reader::getFileSize() { zim::File *file = this->getZimFileHandler(); zim::offset_type size = 0; - + if (file != NULL) { size = file->getFilesize(); } diff --git a/src/common/kiwix/reader.h b/src/common/kiwix/reader.h index dec5c8be..f4458c71 100644 --- a/src/common/kiwix/reader.h +++ b/src/common/kiwix/reader.h @@ -38,7 +38,7 @@ using namespace std; namespace kiwix { class Reader { - + public: Reader(const string zimFilePath); ~Reader(); @@ -58,6 +58,7 @@ namespace kiwix { string getDate(); string getCreator(); string getPublisher(); + string getOrigId(); bool getFavicon(string &content, string &mimeType); bool getPageUrlFromTitle(const string &title, string &url); bool getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType); @@ -69,7 +70,7 @@ namespace kiwix { bool parseUrl(const string &url, char *ns, string &title); unsigned int getFileSize(); zim::File* getZimFileHandler(); - + protected: zim::File* zimFileHandler; zim::size_type firstArticleOffset; @@ -77,7 +78,7 @@ namespace kiwix { zim::size_type currentArticleOffset; zim::size_type nsACount; zim::size_type nsICount; - + std::vector suggestions; std::vector::iterator suggestionsOffset; diff --git a/src/common/stringTools.cpp b/src/common/stringTools.cpp index cdc07c28..1553c5ba 100644 --- a/src/common/stringTools.cpp +++ b/src/common/stringTools.cpp @@ -174,36 +174,40 @@ std::string kiwix::ucFirst (const std::string &word) { if (word.empty()) return ""; - std::string ucFirstWord; + std::string result; -#ifdef __ANDROID__ - ucFirstWord = word; - ucFirstWord[0] = toupper(ucFirstWord[0]); -#else - UnicodeString firstLetter = UnicodeString(word.substr(0, 1).c_str()); - UnicodeString ucFirstLetter = firstLetter.toUpper(); - ucFirstLetter.toUTF8String(ucFirstWord); - ucFirstWord += word.substr(1); -#endif + UnicodeString unicodeWord(word.c_str()); + UnicodeString unicodeFirstLetter = unicodeWord.tempSubString(0, 1).toUpper(); + unicodeWord.replace(0, 1, unicodeFirstLetter); + unicodeWord.toUTF8String(result); - return ucFirstWord; + return result; } std::string kiwix::lcFirst (const std::string &word) { if (word.empty()) return ""; - std::string ucFirstWord; + std::string result; -#ifdef __ANDROID__ - ucFirstWord = word; - ucFirstWord[0] = tolower(ucFirstWord[0]); -#else - UnicodeString firstLetter = UnicodeString(word.substr(0, 1).c_str()); - UnicodeString ucFirstLetter = firstLetter.toLower(); - ucFirstLetter.toUTF8String(ucFirstWord); - ucFirstWord += word.substr(1); -#endif + UnicodeString unicodeWord(word.c_str()); + UnicodeString unicodeFirstLetter = unicodeWord.tempSubString(0, 1).toLower(); + unicodeWord.replace(0, 1, unicodeFirstLetter); + unicodeWord.toUTF8String(result); - return ucFirstWord; + return result; +} + + +std::string kiwix::toTitle (const std::string &word) { + if (word.empty()) + return ""; + + std::string result; + + UnicodeString unicodeWord(word.c_str()); + unicodeWord = unicodeWord.toTitle(0); + unicodeWord.toUTF8String(result); + + return result; } diff --git a/src/common/stringTools.h b/src/common/stringTools.h index 47cbc6f9..a3f6da71 100644 --- a/src/common/stringTools.h +++ b/src/common/stringTools.h @@ -20,7 +20,6 @@ #ifndef KIWIX_STRINGTOOLS_H #define KIWIX_STRINGTOOLS_H -#ifndef __ANDROID__ #include #include #include @@ -29,7 +28,6 @@ #include #include #include -#endif #include #include @@ -58,6 +56,7 @@ namespace kiwix { std::string ucFirst(const std::string &word); std::string lcFirst(const std::string &word); + std::string toTitle(const std::string &word); } #endif