mirror of
https://github.com/kiwix/libkiwix.git
synced 2025-09-07 22:27:00 -04:00
Fuzzy match url in the server.
This commit is contained in:
parent
8b8a038f1f
commit
e1acac2bc7
@ -1126,6 +1126,7 @@ std::unique_ptr<Response> InternalServer::build_response_for_path(const RequestC
|
||||
}
|
||||
|
||||
auto response = ItemResponse::build(*this, request, entry.getItem());
|
||||
response->set_etag_body(std::string(archive.getUuid()));
|
||||
|
||||
if ( !startsWith(entry.getItem().getMimetype(), "application/pdf") ) {
|
||||
// NOTE: Content security policy is not applied to PDF content so that
|
||||
@ -1137,6 +1138,132 @@ std::unique_ptr<Response> InternalServer::build_response_for_path(const RequestC
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
 * One URL-rewriting rule tried when an exact path lookup fails.
 *
 * The member order is significant: the rule table is written with
 * positional aggregate initialization.
 */
struct FuzzyRule
{
  /// Regular expression a queried URL has to match for the rule to apply.
  std::string match;

  /// Replacement pattern used together with `match` to build the canonical
  /// URL. When empty, the URL prefix cut at `split` is used instead.
  std::string fuzzyCannonReplace;

  /// Separator at which the URL is cut to build the prefix; empty means "?".
  std::string split;

  /// When true the cut happens at the last occurrence of the separator,
  /// otherwise at the first one.
  bool splitlast;

  /// Groups of query-argument names; every group yields one candidate URL
  /// made of the canonical URL plus those arguments.
  std::vector<std::vector<std::string>> args;
};

/// Ordered collection of rules.
using FuzzyRules = std::vector<FuzzyRule>;
|
||||
|
||||
|
||||
// Built-in fuzzy rules, tried in declaration order (the first match wins).
// Each entry lists, positionally: match, fuzzyCannonReplace, split,
// splitlast, args.
const FuzzyRules FUZZY_RULES = {
  // YouTube channel pages ("/@name"): keep only the part before the query.
  {
    /* match              */ "^(https?://(?:www\\.)?)(youtube\\.com/@[^?]+)[?].*",
    /* fuzzyCannonReplace */ "$1$2",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {},
  },
  // YouTube get_video_info, looked up by "video_id".
  {
    /* match              */ "//(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info)",
    /* fuzzyCannonReplace */ "//youtube.fuzzy.replayweb.page/$1",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {{"video_id"}},
  },
  // YouTube "youtubei/v1" endpoints, looked up by "videoId".
  {
    /* match              */ "//(?:www\\.)?youtube(?:-nocookie)?\\.com/(youtubei/v1/[^?]+\\?).*(videoId[^&]+).*",
    /* fuzzyCannonReplace */ "//youtube.fuzzy.replayweb.page/$1$2",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {{"videoId"}},
  },
  // googlevideo "videoplayback": try "id" + "itag" first, then "id" alone.
  {
    /* match              */ "//.*googlevideo.com/(videoplayback)",
    /* fuzzyCannonReplace */ "//youtube.fuzzy.replayweb.page/$1",
    /* split              */ "",
    /* splitlast          */ false,
    /* args               */ {
      {"id", "itag"},
      {"id"}
    },
  },
};
|
||||
|
||||
/**
 * Return the first rule of FUZZY_RULES whose `match` expression accepts
 * the given URL.
 *
 * A "?" is appended to the URL when it contains none before the rules are
 * evaluated.
 *
 * @param path URL to test. Taken by value because it may be amended locally.
 * @return A copy of the first matching rule.
 * @throws std::runtime_error ("No Rule") when no rule matches.
 */
FuzzyRule get_rule(std::string path) {
  if (path.find('?') == std::string::npos) {
    path.push_back('?');
  }

  // No tracing here: this runs on the request path, and an unconditional
  // flushing write to stdout per rule is both noisy and costly.
  for (const auto& fuzzy_rule : FUZZY_RULES) {
    if (matchRegex(path, fuzzy_rule.match)) {
      return fuzzy_rule;
    }
  }
  throw std::runtime_error("No Rule");
}
|
||||
|
||||
std::vector<std::string> gen_fuzzy_urls(const RequestContext& request, const std::string& path) {
|
||||
std::vector<std::string> fuzzy_urls;
|
||||
// First of all, add the query_string
|
||||
auto url_queried = path + "?" + request.get_query();
|
||||
fuzzy_urls.push_back(url_queried);
|
||||
try {
|
||||
auto rule = get_rule(url_queried);
|
||||
|
||||
std::cout << "Matching rule : " << rule.match << std::endl;
|
||||
|
||||
std::string sep = rule.split.size() != 0 ? rule.split : "?";
|
||||
auto split_idx = rule.splitlast ? url_queried.rfind(sep) : url_queried.find(sep);
|
||||
auto prefix = split_idx == std::string::npos ? url_queried: url_queried.substr(0, split_idx+sep.size());
|
||||
|
||||
std::cout << "Prefix is : " << prefix << std::endl;
|
||||
|
||||
std::string fuzzy_cannon_url;
|
||||
if (rule.fuzzyCannonReplace.size() != 0) {
|
||||
std::cout << "replace " << rule.match << " with " << rule.fuzzyCannonReplace << std::endl;
|
||||
fuzzy_cannon_url = replaceRegex(url_queried, rule.fuzzyCannonReplace, rule.match);
|
||||
} else {
|
||||
fuzzy_cannon_url = prefix;
|
||||
}
|
||||
|
||||
// remove querystring from fuzzy_cannon_url.
|
||||
split_idx = fuzzy_cannon_url.find("?");
|
||||
fuzzy_cannon_url = fuzzy_cannon_url.substr(0, split_idx);
|
||||
|
||||
std::cout << "fuzzy_cannon_url is : " << fuzzy_cannon_url << std::endl;
|
||||
|
||||
fuzzy_urls.push_back(fuzzy_cannon_url);
|
||||
|
||||
for (auto args: rule.args) {
|
||||
std::stringstream query;
|
||||
std::string sep="?";
|
||||
for (auto arg: args) {
|
||||
query << sep << arg << "=" << request.get_optional_param(arg, std::string());
|
||||
sep = "&";
|
||||
}
|
||||
fuzzy_urls.push_back(fuzzy_cannon_url+query.str());
|
||||
}
|
||||
} catch(const std::runtime_error&) {
|
||||
auto split_idx = url_queried.find("?");
|
||||
fuzzy_urls.push_back(split_idx == std::string::npos ? url_queried : url_queried.substr(0, split_idx+1));
|
||||
}
|
||||
return fuzzy_urls;
|
||||
}
|
||||
|
||||
/**
 * Build the response for `path`, falling back to fuzzy URL matching when
 * the exact path is not in the archive.
 *
 * The exact path is tried first. If it is not found, every URL produced by
 * gen_fuzzy_urls() is tried in order and the first one found is served.
 *
 * @param request  Incoming request.
 * @param archive  Archive to look the entry up in.
 * @param bookName Name of the book, forwarded to build_response_for_path().
 * @param path     Requested path inside the archive.
 * @return The response for the first candidate that exists.
 * @throws zim::EntryNotFound when neither the path nor any fuzzy candidate
 *         exists.
 */
std::unique_ptr<Response> InternalServer::build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const
{
  try {
    return build_response_for_path(request, archive, bookName, path);
  } catch (const zim::EntryNotFound&) {
    // Exact lookup failed: we have to do fuzzy matching (below, outside of
    // the handler so the retries do not run inside an active exception).
  }

  for (const auto& fuzzy_url : gen_fuzzy_urls(request, path)) {
    try {
      return build_response_for_path(request, archive, bookName, fuzzy_url);
    } catch (const zim::EntryNotFound&) {
      // Deliberately ignored: this candidate does not exist, try the next.
    }
  }

  // No fuzzy path matches.
  throw zim::EntryNotFound("No fuzzy rule matches.");
}
|
||||
|
||||
std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& request)
|
||||
{
|
||||
const std::string url = request.get_url();
|
||||
@ -1174,8 +1301,7 @@ std::unique_ptr<Response> InternalServer::handle_content(const RequestContext& r
|
||||
}
|
||||
|
||||
try {
|
||||
auto response = build_response_for_path(request, *archive, bookName, urlStr);
|
||||
response->set_etag_body(archiveUuid);
|
||||
auto response = build_response_for_fuzzypath(request, *archive, bookName, urlStr);
|
||||
return response;
|
||||
} catch(zim::EntryNotFound& e) {
|
||||
if (m_verbose.load())
|
||||
|
@ -124,6 +124,7 @@ class InternalServer {
|
||||
std::unique_ptr<Response> build_redirect(const std::string& bookName, const zim::Item& item) const;
|
||||
std::unique_ptr<Response> build_homepage(const RequestContext& request);
|
||||
std::unique_ptr<Response> build_response_for_path(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const;
|
||||
std::unique_ptr<Response> build_response_for_fuzzypath(const RequestContext& request, const zim::Archive& archive, const std::string& bookName, const std::string& path) const;
|
||||
std::unique_ptr<Response> handle_viewer_settings(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_skin(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_catalog(const RequestContext& request);
|
||||
|
@ -122,6 +122,12 @@ zim::Entry getEntryFromPath(const zim::Archive& archive, const std::string& path
|
||||
if (path.empty() || path == "/") {
|
||||
return archive.getMainEntry();
|
||||
}
|
||||
std::cout << "Search for H/"<<path << std::endl;
|
||||
auto entry = archive.getEntryByPath("H/"+path);
|
||||
while (entry.isRedirect()) {
|
||||
entry = entry.getRedirectEntry();
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
throw zim::EntryNotFound("Cannot find entry for non empty path");
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user