Include fuzzy search for Zimit articles and title search (#379)

This commit is contained in:
Jaifroid 2023-03-08 20:53:16 +00:00 committed by GitHub
parent 886c7183bb
commit d62cea58c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 110 additions and 17 deletions

View File

@ -1,5 +1,9 @@
# Changelog
## In-progress release 2.4.1
* ENHANCEMENT: Provide fuzzy search for case-insensitive links in Zimit archives
## Release 2.4.0
* FEATURE: Support Full Screen (all browsers) and rotation lock (primarily intended for mobile)

View File

@ -106,6 +106,7 @@
<div id="update" class="update">
<h3 style="margin-top:0;">Changes in version <span class="version">2.0</span></h3>
<ul style="padding-left: 15px;">
<li>Provide fuzzy search for case-insensitive links in Zimit archives</li>
<li>Support Full Screen (all browsers) and rotation lock (primarily intended for mobile)</li>
<li>Significant speed-up of access to Wikimedia archives with option to ignore unneeded JS files</li>
<li>Added sandbox attribute to iframe to block top-level navigation and attempts by scripts to "phone home"</li>

View File

@ -65,7 +65,7 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'utf8', 'cache', 'images
// Test caching capability
cache.test(function(){});
// Unique identifier of the article expected to be displayed
var expectedArticleURLToBeDisplayed = "";
appstate.expectedArticleURLToBeDisplayed = '';
// Check if we have managed to switch to PWA mode (if running UWP app)
// DEV: we do this in init.js, but sometimes it doesn't seem to register, so we do it again once the app has fully launched
if (/UWP\|PWA/.test(params.appType) && /^http/i.test(window.location.protocol)) {
@ -3992,9 +3992,9 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'utf8', 'cache', 'images
*/
function isDirEntryExpectedToBeDisplayed(dirEntry) {
var curArticleURL = dirEntry.namespace + "/" + dirEntry.url;
if (expectedArticleURLToBeDisplayed !== curArticleURL) {
if (appstate.expectedArticleURLToBeDisplayed !== curArticleURL) {
console.debug("url of current article :" + curArticleURL + ", does not match the expected url :" +
expectedArticleURLToBeDisplayed);
appstate.expectedArticleURLToBeDisplayed);
return false;
}
return true;
@ -4008,7 +4008,7 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'utf8', 'cache', 'images
// Reset search prefix to allow users to search the same string again if they want to
appstate.search.prefix = '';
// Only update for expectedArticleURLToBeDisplayed.
expectedArticleURLToBeDisplayed = dirEntry.namespace + "/" + dirEntry.url;
appstate.expectedArticleURLToBeDisplayed = dirEntry.namespace + '/' + dirEntry.url;
params.pagesLoaded++;
if (dirEntry.isRedirect()) {
appstate.selectedArchive.resolveRedirect(dirEntry, readArticle);
@ -5704,6 +5704,7 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'util', 'utf8', 'cache', 'images
* @param {String} pathEnc The fully encoded version of the path for use with some Zimit archives
*/
function goToArticle(path, download, contentType, pathEnc) {
appstate.expectedArticleURLToBeDisplayed = path;
//This removes any search highlighting
clearFindInArticle();
var shortTitle = path.replace(/[^/]+\//g, '').substring(0, 18);

View File

@ -172,7 +172,7 @@ define(rqDef, function(util) {
}
}
function pollSpinner(msg) {
function pollSpinner(msg, noTimeout) {
msg = msg || '';
document.getElementById('searchingArticles').style.display = 'block';
var cachingAssets = document.getElementById('cachingAssets');
@ -181,7 +181,7 @@ define(rqDef, function(util) {
else cachingAssets.style.display = 'none';
// Never allow spinner to run for more than 3s
clearTimeout(clearSpinner);
setTimeout(clearSpinner, 3000);
if (!noTimeout) setTimeout(clearSpinner, 3000);
}
function clearSpinner() {

View File

@ -290,7 +290,7 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
var rgxSplitPrefix = /^[-ABCHIJMUVWX]\//;
if (that._file.zimType === 'zimit' && cns === 'C') {
// We have to account for the Zimit prefix in Type 1 ZIMs
rgxSplitPrefix = /^[CMWX]\/(?:[AH]\/)?/;
rgxSplitPrefix = /^(?:[CMWX]\/)?(?:[AH]\/)?/;
}
var splitPrefix = prefix.match(rgxSplitPrefix);
prefixNameSpaces = splitPrefix ? splitPrefix[0] : '';
@ -305,6 +305,10 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
startArray.push(prefix.replace(/^./, function (m) {
return m.toLocaleUpperCase();
}));
// Add pure lowercase string (rarer)
startArray.push(prefix);
// Add a case-insensitive search for the string (pseudo-regex notation)
startArray.push('/' + prefix + '/i');
// Get the full array of combinations to check number of combinations
var fullCombos = util.removeDuplicateStringsInSmallArray(util.allCaseFirstLetters(prefix, 'full'));
// Put cap on exponential number of combinations (five words = 3^5 = 243 combinations)
@ -343,7 +347,14 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
if (!noInterim) callback(dirEntries, search);
search.found = dirEntries.length;
var prefix = prefixNameSpaces + prefixVariants[0];
// console.debug('Searching for: ' + prefixVariants[0]);
search.lc = false;
// If it's pseudo-regex with a case-insensitive flag like '/my search/i', do an enhanced case-insensitive search
if (/^\/.+\/i$/.test(prefixVariants[0])) {
search.lc = true;
prefix = prefixNameSpaces + prefixVariants[0].replace(/^\/(.+)\/i/, '$1').toLocaleLowerCase();
console.debug('Searching case-insensitively for: "' + prefix + '"');
}
// Remove in-progress search variant from array
prefixVariants = prefixVariants.slice(1);
// Search window sets an upper limit on how many matching dirEntries will be scanned in a full index search
search.window = search.rgxPrefix ? 10000 * search.size : search.size;
@ -352,8 +363,18 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
search.countReport = countReport;
if (search.status === 'cancelled') return callback([], search);
if (!noInterim && countReport === true) return callback(dirEntries, search);
if (interim) {// Only push interim results (else results will be pushed again at end of variant loop)
[].push.apply(dirEntries, newDirEntries);
// Only push interim results to the dirEntries array (otherwise we get a duplicated array when the final results are reported to this function)
if (interim) {
// Collect all the found paths for the dirEntries so far
var dirEntryPaths = [];
for (var i = 0; i < dirEntries.length; i++) {
dirEntryPaths.push(dirEntries[i].url);
}
// Push new directory entries to the end of the global array so long as they are not duplicates
for (var j = 0; j < newDirEntries.length; j++) {
if (~dirEntryPaths.indexOf(newDirEntries[j].url)) continue;
dirEntries.push(newDirEntries[j]);
}
search.found = dirEntries.length;
if (!noInterim && newDirEntries.length) return callback(dirEntries, search);
} else return searchNextVariant();
@ -420,9 +441,18 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
if (ns < cns) return 1;
if (ns > cns) return -1;
// We should now be in namespace A (old format ZIM) or C (new format ZIM)
return prefix <= dirEntry.getTitleOrUrl() ? -1 : 1;
if (search.lc) { // Search comparator should be lowercase (for case-insensitive search)
ti = ti.toLocaleLowerCase();
prefix = prefix.toLocaleLowerCase();
}
return prefix <= ti ? -1 : 1;
} else {
return prefix <= ns + '/' + ti ? -1 : 1;
if (search.lc) { // Search comparator should be lowercase (for case-insensitive search)
ns = ns + '/' + ti.replace(/^((?:[AH])?)\/?.*/, '$1');
ti = ti.replace(/^[AH]\//, '').toLocaleLowerCase();
}
// if (search.rgxPrefix && search.rgxPrefix.test(ti)) return -1;
return prefix <= (ns + '/' + ti) ? -1 : 1;
}
});
}, true).then(function(firstIndex) {
@ -445,9 +475,13 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
var title = dirEntry.getTitleOrUrl();
// If we are searching by URL, display namespace also
if (search.searchUrlIndex) title = dirEntry.namespace + '/' + dirEntry.url;
if (search.lc && !search.rgxPrefix) { // Search comparator should be lowercase if not using regex (for case-insensitive search)
var ns = title.replace(/^((?:C\/)?(?:[AH]\/)?).*/, '$1');
title = ns + title.replace(ns, '').toLocaleLowerCase();
}
// Only return dirEntries with titles that actually begin with prefix
if (saveStartIndex === null || (search.searchUrlIndex || dirEntry.namespace === cns) && title.indexOf(prefix) === 0) {
if (!search.rgxPrefix || search.rgxPrefix && search.rgxPrefix.test(title.replace(prefix, ''))) {
if (!search.rgxPrefix || search.rgxPrefix && search.rgxPrefix.test(title)) { // Regex test case-insensitive if i flag set
vDirEntries.push(dirEntry);
// Report interim result
if (typeof saveStartIndex === 'undefined') callback([dirEntry], false, true);
@ -610,6 +644,9 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
}
callback(dirEntry, data);
}
}).catch(function (e) {
console.error('Error reading directory entry', e);
callback(dirEntry, '');
});
};
@ -681,11 +718,30 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
path = path.replace(/^A\//, 'H/').replace(/^(C\/)A\//, '$1H/');
console.debug('DirEntry ' + oldPath + ' not found, looking up header: ' + path);
return that.getDirEntryByPath(path, true, oldPath);
// } else if (zimitResolving) {
} else if (zimitResolving && appstate.originalPath && appstate.originalPath === appstate.expectedArticleURLToBeDisplayed) {
// We couldn't find the Header, so try a fuzzy search only if the user is loading an article
path = appstate.originalPath;
var ns = path.replace(/^((?:C\/)?A\/).*/, '$1'); // If Zimit pseudo-namespaces are changed, will need to edit this
path = path.replace(ns, '');
path = path.toLocaleLowerCase(); // We are going to combine case-insensitive string comparison with regex matching
var rgxPath = path.replace(/([-/?.$^|*+()[{])/g, '\\$1'); // Make sure we escape regex characters
path = ns + path; // Add namespace back to path for full matching
// path = ns;
var search = {
rgxPrefix: new RegExp('.*' + rgxPath, 'i'),
searchUrlIndex: true,
lc: true, // Make the comparator (e.g. dirEntry.url) lowercase
size: 1,
found: 0
}
return fuzzySearch(path, search);
} else {
var newpath = path.replace(/^((?:A|C\/A)\/)[^/]+\/(.+)$/, '$1$2');
if (newpath === path) return null; // No further paths to explore!
console.log("Article " + path + " not available, but moving up one directory to compensate for ZIM coding error...");
return that.getDirEntryByPath(newpath);
}
var newpath = path.replace(/^((?:A|C\/A)\/)[^/]+\/(.+)$/, '$1$2');
if (newpath === path) return null; // No further paths to explore!
console.log("Article " + path + " not available, but moving up one directory to compensate for ZIM coding error...");
return that.getDirEntryByPath(newpath);
} else {
// DEBUG: List found Directory Entry
// if (dirEntry) console.debug('Found ' + path);
@ -694,6 +750,37 @@ define(['zimfile', 'zimDirEntry', 'transformZimit', 'util', 'uiUtil', 'utf8'],
});
};
/**
 * Initiate a fuzzy search for dirEntries matching the search object.
 *
 * Wraps the callback-based archive search in a Promise. The first result reported by the search decides
 * the outcome: if it contains a Directory Entry with a url (and search.found is still falsy), that entry
 * is passed through transformZimit.filterReplayFiles and the Promise resolves with the filtered value;
 * otherwise the Promise resolves with null.
 *
 * Side effects: shows the search spinner without its automatic timeout, and increments search.found
 * when a match is accepted.
 *
 * @param {String} path Human-readable path to search for
 * @param {Object} search The search object (its found property is read and incremented here)
 * @returns {Promise<DirEntry>} A Promise that resolves to a Directory Entry, or null if not found;
 *     it rejects if processing the search result throws
 */
function fuzzySearch(path, search) {
    return new Promise(function (resolve, reject) {
        console.log('Initiating fuzzy search for ' + path + '...');
        // The second argument asks pollSpinner not to schedule its usual automatic clear of the spinner
        uiUtil.pollSpinner('Fuzzy search for ' + path + '...', true);
        // NOTE(review): the trailing null is presumably the start index expected by the archive search - confirm against its signature
        appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(path, search, function (dirEntries) {
            // The callback may run asynchronously, outside the Promise executor, so an exception here would
            // otherwise leave the Promise pending forever: convert it into a rejection instead
            try {
                var firstEntry = dirEntries && dirEntries[0];
                if (!search.found && firstEntry && firstEntry.url) {
                    search.found++;
                    // The filter may return a falsy value, in which case we resolve with that value
                    var dirEntry = transformZimit.filterReplayFiles(firstEntry);
                    if (dirEntry) console.debug('Found ' + dirEntry.url + ' in fuzzy search');
                    resolve(dirEntry);
                } else {
                    console.debug('No fuzzy search results found');
                    resolve(null);
                }
            } catch (err) {
                reject(err);
            }
        }, null);
    });
}
/**
*
* @param {callbackDirEntry} callback