Preliminary support for ZIMs with no namespace

Former-commit-id: 48b08080315e614bc83232b38f7ad4635cf49028 [formerly 25819217876aea980a41ce62ea1f526d6ec8a5ce [formerly d7f4153def08652e4bc75f9a142d96a94c23f2d4]] Former-commit-id: 529a316522cf28e2752955fbac95eccee4759440 Former-commit-id: f34f63004066c50c216d7376a1e3fc0a93ed5329
2025-09-09 04:06:27 -04:00 · 2021-02-06 18:03:03 +00:00 · 2021-02-06 18:03:03 +00:00 · 58eb005859
commit 58eb005859
parent a73395a0d6
6 changed files with 65 additions and 24 deletions
--- a/pwabuilder-sw.js
+++ b/pwabuilder-sw.js
@ -12,7 +12,7 @@ const regexpKiwixDownloadLinks = /download\.kiwix\.org/i;

 // Pattern for ZIM file namespace - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
 // In our case, there is also the ZIM file name, used as a prefix in the URL
-const regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABIJMUVWX])\/(.+)/;
+const regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABCIJMUVWX])\/(.+)/;

 const CACHE = "kiwix-precache-" + appVersion;
 const precacheFiles = [
--- a/www/js/app.js
+++ b/www/js/app.js
@ -3075,14 +3075,14 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
        // Pattern to find the path in a url
        var regexpPath = /^(.*\/)[^\/]+$/;
        // Pattern to find a ZIM URL (with its namespace) - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
-        var regexpZIMUrlWithNamespace = /^[.\/]*([-ABIJMUVWX]\/.+)$/;
-        // Regex below finds images, scripts, and stylesheets with ZIM-type metadata and image namespaces [kiwix-js #378]
-        // It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["']
-        // or href=["'] (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing lookahead that
-        // matches ZIM URLs with namespaces [-IJ] ('-' = metadata or 'I'/'J' = image). When the regex is used below, it will also
-        // remove any relative or absolute path from ZIM-style URLs.
-        // DEV: If you want to support more namespaces, add them to the END of the character set [-IJ] (not to the beginning) 
-        var regexpTagsWithZimUrl = /(<(?:img|script|link)\b[^>]*?\s)(?:src|href)(\s*=\s*["'])(?:\.\.\/|\/)+(?=[-IJ]\/)/ig;
+    var regexpZIMUrlWithNamespace = /^[./]*([-ABCIJMUVWX]\/.+)$/;
+    // Regex below finds images, scripts, stylesheets and tracks with ZIM-type metadata and image namespaces [kiwix-js #378].
+    // It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["'] or href=["']
+    // (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing negative lookahead that excludes
+    // URLs that begin 'http' (i.e. non-relative URLs). It then captures the whole of the URL up until either the closing delimiter
+    // (the same " or ' that opened the value, which is capture group \3) or a querystring or hash character (? or #). When the regex is used below, it will be further
+    // processed to calculate the ZIM URL from the relative path. This regex can cope with legitimate single quote marks (') in the URL.
+    var regexpTagsWithZimUrl = /(<(?:img|script|link|track)\b[^>]*?\s)(?:src|href)(\s*=\s*(["']))(?!http)(.+?)(?=\3|\?|#)/ig;
        // Regex below tests the html of an article for active content [kiwix-js #466]
        // It inspects every <script> block in the html and matches in the following cases: 1) the script loads a UI application called app.js;
        // 2) the script block has inline content that does not contain "importScript()", "toggleOpenSection" or an "articleId" assignment
@ -3148,8 +3148,17 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
            params.htmlArticle = htmlArticle;

            // Replaces ZIM-style URLs of img, script, link and media tags with a data-kiwixurl to prevent 404 errors [kiwix-js #272 #376]
-            // This replacement also processes the URL to remove the path so that the URL is ready for subsequent jQuery functions
-            if (params.contentInjectionMode == 'jquery') htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, '$1data-kiwixurl$2');
+        // This replacement also processes the URL relative to the page's ZIM URL so that we can find the ZIM URL of the asset
+        // with the correct namespace (this works for old-style -,I,J namespaces and for new-style C namespace)
+            if (params.contentInjectionMode == 'jquery') {
+                // Rewrite each matched src/href attribute as a data-kiwixurl attribute holding the asset's ZIM URL.
+                // The regex only looks ahead to the closing quote (it is not consumed), so the text after the match is left
+                // untouched and only the opening part (blockStart + attribute name + equals, which includes the opening quote)
+                // needs to be re-emitted here.
+                htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, function(match, blockStart, equals, quote, relAssetUrl) {
+                    var assetZIMUrl = uiUtil.deriveZimUrlFromRelativeUrl(relAssetUrl, baseUrl);
+                    // DEV: Note that deriveZimUrlFromRelativeUrl produces a *decoded* URL (and incidentally would remove any URI component
+                    // if we had captured it). We therefore re-encode the URI with encodeURI (which does not encode forward slashes) instead
+                    // of encodeURIComponent.
+                    return blockStart + 'data-kiwixurl' + equals + encodeURI(assetZIMUrl);
+                });
+            }
            // Remove any empty media containers on page
            htmlArticle = htmlArticle.replace(/(<(audio|video)\b(?:[^<]|<(?!\/\2))+<\/\2>)/ig, function (p0) {
                return /(?:src|data-kiwixurl)\s*=\s*["']/.test(p0) ? p0 : '';
@ -4010,8 +4019,10 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
                    document.getElementById('searchingArticles').style.display = 'none';
                    uiUtil.systemAlert("Error finding random article.");
                } else {
-                    //Test below supports Stackexchange-family ZIMs, so we don't call up user profiles
-                    if (dirEntry.namespace === 'A' && !/user\//.test(dirEntry.url)) {
+                // We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
+                // DEV: This will need to be changed if we search titlePtrList version 1
+                // in a future PR, as that list contains only articles
+                if (dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
                        params.isLandingPage = false;
                        $('#activeContent').hide();
                        readArticle(dirEntry);
@ -4032,7 +4043,8 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'cache', 'images', 'sett
                    document.getElementById('searchingArticles').style.display = 'none';
                    $("#welcomeText").show();
                } else {
-                    if (dirEntry.namespace === 'A') {
+                // DEV: see comment above under goToRandomArticle()
+                if (dirEntry.redirect || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
                        params.isLandingPage = true;
                        readArticle(dirEntry);
                    } else {
--- a/www/js/lib/settingsStore.js
+++ b/www/js/lib/settingsStore.js
@ -4,7 +4,7 @@ define([], function () {
   * settingsStore.js
   * 
   * A reader/writer framework for cookies or localStorage with full unicode support based on the Mozilla cookies framework.
-   * The Mozilla code has been adapted to test for the availability of the localStorage API, and to use it in preference to settingsStore.
+   * The Mozilla code has been adapted to test for the availability of the localStorage API, and to use it in preference to cookies.
   * 
   * Mozilla version information:
   * 
--- a/www/js/lib/uiUtil.js
+++ b/www/js/lib/uiUtil.js
@ -273,9 +273,9 @@ define(rqDef, function() {
     * Derives the URL.pathname from a relative or semi-relative URL using the given base ZIM URL
     * 
     * @param {String} url The (URI-encoded) URL to convert (e.g. "Einstein", "../Einstein",
-     *      "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html")
-     * @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/" or "A/subdir1/subdir2/")
-     * @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png")
+     *      "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html", "../static/bootstrap/css/bootstrap.min.css")
+     * @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/", "A/subdir1/subdir2/", "C/Singapore/")
+     * @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png", "C/")
     */
    function deriveZimUrlFromRelativeUrl(url, base) {
        // We use a dummy domain because URL API requires a valid URI
--- a/www/js/lib/zimArchive.js
+++ b/www/js/lib/zimArchive.js
@ -210,6 +210,30 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
        searchNextVariant();
    };
    
+    /**
+     * Returns the namespace in the ZIM file that contains the primary user content. In old-format ZIM files (minor
+     * version 0) there are a number of content namespaces, but the primary one in which to search for titles is 'A'. In new-format
+     * ZIMs (minor version 1) there is a single content namespace 'C'. See https://wiki.openzim.org/wiki/ZIM_file_format.
+     * @returns {String} The content namespace for the ZIM archive: 'A' for minor version 0, 'C' for minor version 1
+     * @throws {Error} If the ZIM file is not ready, or if its minor version is not one of the known values (0 or 1)
+     */
+    ZIMArchive.prototype.getContentNamespace = function () {
+        if (!this.isReady()) {
+            throw new Error('We could not determine the content namespace because the ZIM file is not ready!');
+        }
+        // DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
+        // If this changes, adapt the error checking and return values
+        // We compare strictly so that a missing, negative or non-numeric version is reported as an error
+        // instead of being silently treated as a new-format ZIM
+        var ver = this._file.minorVersion;
+        if (ver === 0) return 'A';
+        if (ver === 1) return 'C';
+        throw new Error('Unknown ZIM minor version!');
+    };
+    
    /**
     * Look for dirEntries with title starting with the given prefix (case-sensitive)
     * 
@ -226,12 +250,14 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
        startIndex = startIndex || 0;
        prefix = prefix || '';
        var that = this;
+        var cns = this.getContentNamespace();
        util.binarySearch(startIndex, this._file.articleCount, function(i) {
            return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {
                if (search.status === 'cancelled') return 0;
-                if (dirEntry.namespace < 'A') return 1;
-                if (dirEntry.namespace > 'A') return -1;
-                // We should now be in namespace A
+                var ns = dirEntry.namespace;
+                if (ns < cns) return 1;
+                if (ns > cns) return -1;
+                // We should now be in namespace A (old format ZIM) or C (new format ZIM)
                return prefix <= dirEntry.getTitleOrUrl() ? -1 : 1;
            });
        }, true).then(function(firstIndex) {
@ -246,7 +272,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
                return that._file.dirEntryByTitleIndex(index).then(function(dirEntry) {
                    var title = dirEntry.getTitleOrUrl();
                    // Only return dirEntries with titles that actually begin with prefix
-                    if (saveStartIndex === null || dirEntry.namespace === 'A' && title.indexOf(prefix) === 0) {
+                    if (saveStartIndex === null || dirEntry.namespace === cns && title.indexOf(prefix) === 0) {
                        dirEntries.push(dirEntry);
                        // Report interim result
                        if (typeof saveStartIndex === 'undefined') callback([dirEntry], index, true);
@ -327,9 +353,9 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
            if (index === null) return null;
            return that._file.dirEntryByUrlIndex(index);
        }).then(function(dirEntry) {
-            if ((dirEntry === null || dirEntry === undefined) && /^A\/[^/]+\/.+/i.test(title)) {
+            if ((dirEntry === null || dirEntry === undefined) && /^[AC]\/[^/]+\/.+/i.test(title)) {
                console.log("Article " + title + " not available, but moving up one directory to compensate for ZIM coding error...");
-                title = title.replace(/^(A\/)[^/]+\/(.+)$/, '$1$2');
+                title = title.replace(/^([AC]\/)[^/]+\/(.+)$/, '$1$2');
                return that.getDirEntryByTitle(title);
            } else {
                return dirEntry;
--- a/www/js/lib/zimfile.js
+++ b/www/js/lib/zimfile.js
@ -300,6 +300,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
                        zf.id = tempFileId++;
                        fileIDs.set(zf.name, zf.id);
                    }
+                    // For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
+                    zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
+                    zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
                    zf.articleCount = readInt(header, 24, 4);
                    zf.clusterCount = readInt(header, 28, 4);
                    zf.urlPtrPos = urlPtrPos;