Preliminary support for ZIMs with no namespace

Former-commit-id: 48b08080315e614bc83232b38f7ad4635cf49028 [formerly 25819217876aea980a41ce62ea1f526d6ec8a5ce [formerly d7f4153def08652e4bc75f9a142d96a94c23f2d4]]
Former-commit-id: 529a316522cf28e2752955fbac95eccee4759440
Former-commit-id: f34f63004066c50c216d7376a1e3fc0a93ed5329
This commit is contained in:
Jaifroid 2021-02-06 18:03:03 +00:00
parent a73395a0d6
commit 58eb005859
6 changed files with 65 additions and 24 deletions

View File

@ -12,7 +12,7 @@ const regexpKiwixDownloadLinks = /download\.kiwix\.org/i;
// Pattern for ZIM file namespace - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces // Pattern for ZIM file namespace - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
// In our case, there is also the ZIM file name, used as a prefix in the URL // In our case, there is also the ZIM file name, used as a prefix in the URL
const regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABIJMUVWX])\/(.+)/; const regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABCIJMUVWX])\/(.+)/;
const CACHE = "kiwix-precache-" + appVersion; const CACHE = "kiwix-precache-" + appVersion;
const precacheFiles = [ const precacheFiles = [

View File

@ -3075,14 +3075,14 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
// Pattern to find the path in a url // Pattern to find the path in a url
var regexpPath = /^(.*\/)[^\/]+$/; var regexpPath = /^(.*\/)[^\/]+$/;
// Pattern to find a ZIM URL (with its namespace) - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces // Pattern to find a ZIM URL (with its namespace) - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
var regexpZIMUrlWithNamespace = /^[.\/]*([-ABIJMUVWX]\/.+)$/; var regexpZIMUrlWithNamespace = /^[./]*([-ABCIJMUVWX]\/.+)$/;
// Regex below finds images, scripts, and stylesheets with ZIM-type metadata and image namespaces [kiwix-js #378] // Regex below finds images, scripts, stylesheets and tracks with ZIM-type metadata and image namespaces [kiwix-js #378].
// It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["'] // It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["'] or href=["']
// or href=["'] (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing lookahead that // (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing negative lookahead that excludes
// matches ZIM URLs with namespaces [-IJ] ('-' = metadata or 'I'/'J' = image). When the regex is used below, it will also // URLs that begin 'http' (i.e. non-relative URLs). It then captures the whole of the URL up until either the opening delimiter
// remove any relative or absolute path from ZIM-style URLs. // (" or ', which is capture group \3) or a querystring or hash character (? or #). When the regex is used below, it will be further
// DEV: If you want to support more namespaces, add them to the END of the character set [-IJ] (not to the beginning) // processed to calculate the ZIM URL from the relative path. This regex can cope with legitimate single quote marks (') in the URL.
var regexpTagsWithZimUrl = /(<(?:img|script|link)\b[^>]*?\s)(?:src|href)(\s*=\s*["'])(?:\.\.\/|\/)+(?=[-IJ]\/)/ig; var regexpTagsWithZimUrl = /(<(?:img|script|link|track)\b[^>]*?\s)(?:src|href)(\s*=\s*(["']))(?!http)(.+?)(?=\3|\?|#)/ig;
// Regex below tests the html of an article for active content [kiwix-js #466] // Regex below tests the html of an article for active content [kiwix-js #466]
// It inspects every <script> block in the html and matches in the following cases: 1) the script loads a UI application called app.js; // It inspects every <script> block in the html and matches in the following cases: 1) the script loads a UI application called app.js;
// 2) the script block has inline content that does not contain "importScript()", "toggleOpenSection" or an "articleId" assignment // 2) the script block has inline content that does not contain "importScript()", "toggleOpenSection" or an "articleId" assignment
@ -3148,8 +3148,17 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
params.htmlArticle = htmlArticle; params.htmlArticle = htmlArticle;
// Replaces ZIM-style URLs of img, script, link and media tags with a data-kiwixurl to prevent 404 errors [kiwix-js #272 #376] // Replaces ZIM-style URLs of img, script, link and media tags with a data-kiwixurl to prevent 404 errors [kiwix-js #272 #376]
// This replacement also processes the URL to remove the path so that the URL is ready for subsequent jQuery functions // This replacement also processes the URL relative to the page's ZIM URL so that we can find the ZIM URL of the asset
if (params.contentInjectionMode == 'jquery') htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, '$1data-kiwixurl$2'); // with the correct namespace (this works for old-style -,I,J namespaces and for new-style C namespace)
if (params.contentInjectionMode == 'jquery') {
    // Swap each matched src/href attribute for data-kiwixurl so that the browser does not try to load the asset itself
    // (which would produce 404 errors); the attribute value becomes the asset's full ZIM URL.
    htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, function (match, blockStart, equals, quote, relAssetUrl) {
        // Resolve the relative asset URL against the ZIM URL of the current page to obtain the asset's own ZIM URL
        var assetZIMUrl = uiUtil.deriveZimUrlFromRelativeUrl(relAssetUrl, baseUrl);
        // DEV: Note that deriveZimUrlFromRelativeUrl produces a *decoded* URL (and, because it derives only the pathname,
        // it would incidentally drop any querystring or hash if we had captured one). We therefore re-encode the URL with
        // encodeURI (which does not encode forward slashes) instead of encodeURIComponent.
        // The opening quote is already part of 'equals'; the closing quote is left in place by the regex lookahead.
        return blockStart + 'data-kiwixurl' + equals + encodeURI(assetZIMUrl);
    });
}
// Remove any empty media containers on page // Remove any empty media containers on page
htmlArticle = htmlArticle.replace(/(<(audio|video)\b(?:[^<]|<(?!\/\2))+<\/\2>)/ig, function (p0) { htmlArticle = htmlArticle.replace(/(<(audio|video)\b(?:[^<]|<(?!\/\2))+<\/\2>)/ig, function (p0) {
return /(?:src|data-kiwixurl)\s*=\s*["']/.test(p0) ? p0 : ''; return /(?:src|data-kiwixurl)\s*=\s*["']/.test(p0) ? p0 : '';
@ -4010,8 +4019,10 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
document.getElementById('searchingArticles').style.display = 'none'; document.getElementById('searchingArticles').style.display = 'none';
uiUtil.systemAlert("Error finding random article."); uiUtil.systemAlert("Error finding random article.");
} else { } else {
//Test below supports Stackexchange-family ZIMs, so we don't call up user profiles // We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
if (dirEntry.namespace === 'A' && !/user\//.test(dirEntry.url)) { // DEV: This will need to be changed if we search titlePtrList version 1
// in a future PR, as that list contains only articles
if (dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
params.isLandingPage = false; params.isLandingPage = false;
$('#activeContent').hide(); $('#activeContent').hide();
readArticle(dirEntry); readArticle(dirEntry);
@ -4032,7 +4043,8 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
document.getElementById('searchingArticles').style.display = 'none'; document.getElementById('searchingArticles').style.display = 'none';
$("#welcomeText").show(); $("#welcomeText").show();
} else { } else {
if (dirEntry.namespace === 'A') { // DEV: see comment above under goToRandomArticle()
if (dirEntry.redirect || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
params.isLandingPage = true; params.isLandingPage = true;
readArticle(dirEntry); readArticle(dirEntry);
} else { } else {

View File

@ -4,7 +4,7 @@ define([], function () {
* settingsStore.js * settingsStore.js
* *
* A reader/writer framework for cookies or localStorage with full unicode support based on the Mozilla cookies framework. * A reader/writer framework for cookies or localStorage with full unicode support based on the Mozilla cookies framework.
* The Mozilla code has been adapted to test for the availability of the localStorage API, and to use it in preference to settingsStore. * The Mozilla code has been adapted to test for the availability of the localStorage API, and to use it in preference to cookies.
* *
* Mozilla version information: * Mozilla version information:
* *

View File

@ -273,9 +273,9 @@ define(rqDef, function() {
* Derives the URL.pathname from a relative or semi-relative URL using the given base ZIM URL * Derives the URL.pathname from a relative or semi-relative URL using the given base ZIM URL
* *
* @param {String} url The (URI-encoded) URL to convert (e.g. "Einstein", "../Einstein", * @param {String} url The (URI-encoded) URL to convert (e.g. "Einstein", "../Einstein",
* "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html") * "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html", "../static/bootstrap/css/bootstrap.min.css")
* @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/" or "A/subdir1/subdir2/") * @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/", "A/subdir1/subdir2/", "C/Singapore/")
* @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png") * @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png", "C/")
*/ */
function deriveZimUrlFromRelativeUrl(url, base) { function deriveZimUrlFromRelativeUrl(url, base) {
// We use a dummy domain because URL API requires a valid URI // We use a dummy domain because URL API requires a valid URI

View File

@ -210,6 +210,30 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
searchNextVariant(); searchNextVariant();
}; };
/**
 * Returns the namespace in the ZIM file that contains the primary user content. In old-format ZIM files (minor
 * version 0) there are a number of content namespaces, but the primary one in which to search for titles is 'A'.
 * In new-format ZIMs (minor version 1) there is a single content namespace 'C'.
 * See https://openzim.org/wiki/ZIM_file_format.
 *
 * @returns {String} The content namespace for the ZIM archive: 'A' (minor version 0) or 'C' (minor version 1)
 * @throws {Error} If the ZIM file is not ready, or if its minor version is anything other than 0 or 1
 *     (including a missing or non-numeric value), because the namespace cannot then be determined
 */
ZIMArchive.prototype.getContentNamespace = function () {
    if (!this.isReady()) {
        throw new Error('We could not determine the content namespace because the ZIM file is not ready!');
    }
    var ver = this._file.minorVersion;
    // DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
    // If this changes, adapt the error checking and return values
    if (ver === 0) return 'A';
    if (ver === 1) return 'C';
    // Anything else (a higher version, or an undefined/invalid value) must not silently map to a namespace
    throw new Error('Unknown ZIM minor version!');
};
/** /**
* Look for dirEntries with title starting with the given prefix (case-sensitive) * Look for dirEntries with title starting with the given prefix (case-sensitive)
* *
@ -226,12 +250,14 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
startIndex = startIndex || 0; startIndex = startIndex || 0;
prefix = prefix || ''; prefix = prefix || '';
var that = this; var that = this;
var cns = this.getContentNamespace();
util.binarySearch(startIndex, this._file.articleCount, function(i) { util.binarySearch(startIndex, this._file.articleCount, function(i) {
return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) { return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {
if (search.status === 'cancelled') return 0; if (search.status === 'cancelled') return 0;
if (dirEntry.namespace < 'A') return 1; var ns = dirEntry.namespace;
if (dirEntry.namespace > 'A') return -1; if (ns < cns) return 1;
// We should now be in namespace A if (ns > cns) return -1;
// We should now be in namespace A (old format ZIM) or C (new format ZIM)
return prefix <= dirEntry.getTitleOrUrl() ? -1 : 1; return prefix <= dirEntry.getTitleOrUrl() ? -1 : 1;
}); });
}, true).then(function(firstIndex) { }, true).then(function(firstIndex) {
@ -246,7 +272,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
return that._file.dirEntryByTitleIndex(index).then(function(dirEntry) { return that._file.dirEntryByTitleIndex(index).then(function(dirEntry) {
var title = dirEntry.getTitleOrUrl(); var title = dirEntry.getTitleOrUrl();
// Only return dirEntries with titles that actually begin with prefix // Only return dirEntries with titles that actually begin with prefix
if (saveStartIndex === null || dirEntry.namespace === 'A' && title.indexOf(prefix) === 0) { if (saveStartIndex === null || dirEntry.namespace === cns && title.indexOf(prefix) === 0) {
dirEntries.push(dirEntry); dirEntries.push(dirEntry);
// Report interim result // Report interim result
if (typeof saveStartIndex === 'undefined') callback([dirEntry], index, true); if (typeof saveStartIndex === 'undefined') callback([dirEntry], index, true);
@ -327,9 +353,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
if (index === null) return null; if (index === null) return null;
return that._file.dirEntryByUrlIndex(index); return that._file.dirEntryByUrlIndex(index);
}).then(function(dirEntry) { }).then(function(dirEntry) {
if ((dirEntry === null || dirEntry === undefined) && /^A\/[^/]+\/.+/i.test(title)) { if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(title)) {
console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error..."); console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error...");
title = title.replace(/^(A\/)[^/]+\/(.+)$/, '$1$2'); title = title.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
return that.getDirEntryByTitle(title); return that.getDirEntryByTitle(title);
} else { } else {
return dirEntry; return dirEntry;

View File

@ -300,6 +300,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
zf.id = tempFileId++; zf.id = tempFileId++;
fileIDs.set(zf.name, zf.id); fileIDs.set(zf.name, zf.id);
} }
// For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
zf.articleCount = readInt(header, 24, 4); zf.articleCount = readInt(header, 24, 4);
zf.clusterCount = readInt(header, 28, 4); zf.clusterCount = readInt(header, 28, 4);
zf.urlPtrPos = urlPtrPos; zf.urlPtrPos = urlPtrPos;