mirror of
https://github.com/kiwix/kiwix-js-pwa.git
synced 2025-09-09 04:06:27 -04:00
Preliminary support for ZIMs with no namespace
Former-commit-id: 48b08080315e614bc83232b38f7ad4635cf49028 [formerly 25819217876aea980a41ce62ea1f526d6ec8a5ce [formerly d7f4153def08652e4bc75f9a142d96a94c23f2d4]] Former-commit-id: 529a316522cf28e2752955fbac95eccee4759440 Former-commit-id: f34f63004066c50c216d7376a1e3fc0a93ed5329
This commit is contained in:
parent
a73395a0d6
commit
58eb005859
@ -12,7 +12,7 @@ const regexpKiwixDownloadLinks = /download\.kiwix\.org/i;
|
||||
|
||||
// Pattern for ZIM file namespace - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
|
||||
// In our case, there is also the ZIM file name, used as a prefix in the URL
|
||||
const regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABIJMUVWX])\/(.+)/;
|
||||
const regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABCIJMUVWX])\/(.+)/;
|
||||
|
||||
const CACHE = "kiwix-precache-" + appVersion;
|
||||
const precacheFiles = [
|
||||
|
@ -3075,14 +3075,14 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
|
||||
// Pattern to find the path in a url
|
||||
var regexpPath = /^(.*\/)[^\/]+$/;
|
||||
// Pattern to find a ZIM URL (with its namespace) - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
|
||||
var regexpZIMUrlWithNamespace = /^[.\/]*([-ABIJMUVWX]\/.+)$/;
|
||||
// Regex below finds images, scripts, and stylesheets with ZIM-type metadata and image namespaces [kiwix-js #378]
|
||||
// It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["']
|
||||
// or href=["'] (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing lookahead that
|
||||
// matches ZIM URLs with namespaces [-IJ] ('-' = metadata or 'I'/'J' = image). When the regex is used below, it will also
|
||||
// remove any relative or absolute path from ZIM-style URLs.
|
||||
// DEV: If you want to support more namespaces, add them to the END of the character set [-IJ] (not to the beginning)
|
||||
var regexpTagsWithZimUrl = /(<(?:img|script|link)\b[^>]*?\s)(?:src|href)(\s*=\s*["'])(?:\.\.\/|\/)+(?=[-IJ]\/)/ig;
|
||||
var regexpZIMUrlWithNamespace = /^[./]*([-ABCIJMUVWX]\/.+)$/;
|
||||
// Regex below finds images, scripts, stylesheets and tracks with ZIM-type metadata and image namespaces [kiwix-js #378].
|
||||
// It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["'] or href=["']
|
||||
// (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing negative lookahead that excludes
|
||||
// URLs that begin 'http' (i.e. non-relative URLs). It then captures the whole of the URL up until either the closing delimiter
|
||||
// (" or ', which is capture group \3) or a querystring or hash character (? or #). When the regex is used below, it will be further
|
||||
// processed to calculate the ZIM URL from the relative path. This regex can cope with legitimate single quote marks (') in the URL.
|
||||
var regexpTagsWithZimUrl = /(<(?:img|script|link|track)\b[^>]*?\s)(?:src|href)(\s*=\s*(["']))(?!http)(.+?)(?=\3|\?|#)/ig;
|
||||
// Regex below tests the html of an article for active content [kiwix-js #466]
|
||||
// It inspects every <script> block in the html and matches in the following cases: 1) the script loads a UI application called app.js;
|
||||
// 2) the script block has inline content that does not contain "importScript()", "toggleOpenSection" or an "articleId" assignment
|
||||
@ -3148,8 +3148,17 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
|
||||
params.htmlArticle = htmlArticle;
|
||||
|
||||
// Replaces ZIM-style URLs of img, script, link and media tags with a data-kiwixurl to prevent 404 errors [kiwix-js #272 #376]
|
||||
// This replacement also processes the URL to remove the path so that the URL is ready for subsequent jQuery functions
|
||||
if (params.contentInjectionMode == 'jquery') htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, '$1data-kiwixurl$2');
|
||||
// This replacement also processes the URL relative to the page's ZIM URL so that we can find the ZIM URL of the asset
|
||||
// with the correct namespace (this works for old-style -,I,J namespaces and for new-style C namespace)
|
||||
if (params.contentInjectionMode == 'jquery') {
    // Swap the src/href attribute of every matched tag for data-kiwixurl, and replace the relative
    // asset path with the ZIM URL derived from it and baseUrl
    htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, function (match, blockStart, equals, quote, relAssetUrl) {
        var assetZIMUrl = uiUtil.deriveZimUrlFromRelativeUrl(relAssetUrl, baseUrl);
        // DEV: Note that deriveZimUrlFromRelativeUrl produces a *decoded* URL (and incidentally would remove any URI component
        // if we had captured it). We therefore re-encode the URI with encodeURI (which does not encode forward slashes) instead
        // of encodeURIComponent.
        // The closing quote is only tested by the regex's lookahead, not consumed, so it is not re-added here
        return blockStart + 'data-kiwixurl' + equals + encodeURI(assetZIMUrl);
    });
}
|
||||
// Remove any empty media containers on page
|
||||
htmlArticle = htmlArticle.replace(/(<(audio|video)\b(?:[^<]|<(?!\/\2))+<\/\2>)/ig, function (p0) {
|
||||
return /(?:src|data-kiwixurl)\s*=\s*["']/.test(p0) ? p0 : '';
|
||||
@ -4010,8 +4019,10 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
|
||||
document.getElementById('searchingArticles').style.display = 'none';
|
||||
uiUtil.systemAlert("Error finding random article.");
|
||||
} else {
|
||||
//Test below supports Stackexchange-family ZIMs, so we don't call up user profiles
|
||||
if (dirEntry.namespace === 'A' && !/user\//.test(dirEntry.url)) {
|
||||
// We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
|
||||
// DEV: This will need to be changed if we search titlePtrList version 1
|
||||
// in a future PR, as that list contains only articles
|
||||
if (dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
|
||||
params.isLandingPage = false;
|
||||
$('#activeContent').hide();
|
||||
readArticle(dirEntry);
|
||||
@ -4032,7 +4043,8 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
|
||||
document.getElementById('searchingArticles').style.display = 'none';
|
||||
$("#welcomeText").show();
|
||||
} else {
|
||||
if (dirEntry.namespace === 'A') {
|
||||
// DEV: see comment above under goToRandomArticle()
|
||||
if (dirEntry.redirect || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
|
||||
params.isLandingPage = true;
|
||||
readArticle(dirEntry);
|
||||
} else {
|
||||
|
@ -4,7 +4,7 @@ define([], function () {
|
||||
* settingsStore.js
|
||||
*
|
||||
* A reader/writer framework for cookies or localStorage with full unicode support based on the Mozilla cookies framework.
|
||||
* The Mozilla code has been adapted to test for the availability of the localStorage API, and to use it in preference to settingsStore.
|
||||
* The Mozilla code has been adapted to test for the availability of the localStorage API, and to use it in preference to cookies.
|
||||
*
|
||||
* Mozilla version information:
|
||||
*
|
||||
|
@ -273,9 +273,9 @@ define(rqDef, function() {
|
||||
* Derives the URL.pathname from a relative or semi-relative URL using the given base ZIM URL
|
||||
*
|
||||
* @param {String} url The (URI-encoded) URL to convert (e.g. "Einstein", "../Einstein",
|
||||
* "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html")
|
||||
* @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/" or "A/subdir1/subdir2/")
|
||||
* @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png")
|
||||
* "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html", "../static/bootstrap/css/bootstrap.min.css")
|
||||
* @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/", "A/subdir1/subdir2/", "C/Singapore/")
|
||||
* @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png", "C/")
|
||||
*/
|
||||
function deriveZimUrlFromRelativeUrl(url, base) {
|
||||
// We use a dummy domain because URL API requires a valid URI
|
||||
|
@ -210,6 +210,30 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
|
||||
searchNextVariant();
|
||||
};
|
||||
|
||||
/**
 * A method to return the namespace in the ZIM file that contains the primary user content. In old-format ZIM files (minor
 * version 0) there are a number of content namespaces, but the primary one in which to search for titles is 'A'. In new-format
 * ZIMs (minor version 1) there is a single content namespace 'C'. See https://openzim.org/wiki/ZIM_file_format.
 * @returns {String} The content namespace for the ZIM archive ('A' for minor version 0, 'C' for minor version 1)
 * @throws {Error} If the ZIM is not ready, or if its minor version is anything other than 0 or 1
 */
ZIMArchive.prototype.getContentNamespace = function () {
    // Without a loaded file there is no minor version to inspect
    if (!this.isReady()) {
        throw new Error('We could not determine the content namespace because the ZIM file is not ready!');
    }
    var ver = this._file.minorVersion;
    // DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
    // If this changes, adapt the error checking and return values
    if (ver === 0) return 'A';
    if (ver === 1) return 'C';
    // Anything else (including a missing or non-numeric version) cannot be mapped to a namespace, so
    // we throw rather than silently guessing 'C'
    throw new Error('Unknown ZIM minor version!');
};
|
||||
|
||||
/**
|
||||
* Look for dirEntries with title starting with the given prefix (case-sensitive)
|
||||
*
|
||||
@ -226,12 +250,14 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
|
||||
startIndex = startIndex || 0;
|
||||
prefix = prefix || '';
|
||||
var that = this;
|
||||
var cns = this.getContentNamespace();
|
||||
util.binarySearch(startIndex, this._file.articleCount, function(i) {
|
||||
return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {
|
||||
if (search.status === 'cancelled') return 0;
|
||||
if (dirEntry.namespace < 'A') return 1;
|
||||
if (dirEntry.namespace > 'A') return -1;
|
||||
// We should now be in namespace A
|
||||
var ns = dirEntry.namespace;
|
||||
if (ns < cns) return 1;
|
||||
if (ns > cns) return -1;
|
||||
// We should now be in namespace A (old format ZIM) or C (new format ZIM)
|
||||
return prefix <= dirEntry.getTitleOrUrl() ? -1 : 1;
|
||||
});
|
||||
}, true).then(function(firstIndex) {
|
||||
@ -246,7 +272,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
|
||||
return that._file.dirEntryByTitleIndex(index).then(function(dirEntry) {
|
||||
var title = dirEntry.getTitleOrUrl();
|
||||
// Only return dirEntries with titles that actually begin with prefix
|
||||
if (saveStartIndex === null || dirEntry.namespace === 'A' && title.indexOf(prefix) === 0) {
|
||||
if (saveStartIndex === null || dirEntry.namespace === cns && title.indexOf(prefix) === 0) {
|
||||
dirEntries.push(dirEntry);
|
||||
// Report interim result
|
||||
if (typeof saveStartIndex === 'undefined') callback([dirEntry], index, true);
|
||||
@ -327,9 +353,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
|
||||
if (index === null) return null;
|
||||
return that._file.dirEntryByUrlIndex(index);
|
||||
}).then(function(dirEntry) {
|
||||
if ((dirEntry === null || dirEntry === undefined) && /^A\/[^/]+\/.+/i.test(title)) {
|
||||
if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(title)) {
|
||||
console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error...");
|
||||
title = title.replace(/^(A\/)[^/]+\/(.+)$/, '$1$2');
|
||||
title = title.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
|
||||
return that.getDirEntryByTitle(title);
|
||||
} else {
|
||||
return dirEntry;
|
||||
|
@ -300,6 +300,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
|
||||
zf.id = tempFileId++;
|
||||
fileIDs.set(zf.name, zf.id);
|
||||
}
|
||||
// For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
|
||||
zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
|
||||
zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
|
||||
zf.articleCount = readInt(header, 24, 4);
|
||||
zf.clusterCount = readInt(header, 28, 4);
|
||||
zf.urlPtrPos = urlPtrPos;
|
||||
|
Loading…
x
Reference in New Issue
Block a user