Make code more efficient by defining regexes once

Former-commit-id: 9a068d12938d73ac7bbc31b3564da3e92411811f [formerly 3bcbe71717fa80b39f370057bd94901fedea4255 [formerly badd9f626aaddd9d4b32c773ac25e43e46d47c93]]
Former-commit-id: 33d076f9a1e8bd4b8b5a42bb5f93a09447c66985 [formerly 7b39a65d148b54fb801523d09be46504de948f2a]
Former-commit-id: 5b3c41f3c926176e0a2c206714116e2f1763ff2b
This commit is contained in:
Jaifroid 2022-06-01 06:17:36 +01:00
parent 88ea328cc9
commit 3946ff7162

View File

@ -72,6 +72,13 @@ define([], function () {
return dirEntry; return dirEntry;
} }
/**
 * Regular Expressions used by the transformReplayUrls function.
 * They are defined once here, outside the function body, so that the literals are not
 * re-created every time transformReplayUrls runs.
 *
 * NOTE: all three carry the global (g) flag and are therefore stateful (lastIndex) if used
 * with .test() or .exec(). transformReplayUrls only passes them to String.prototype.replace(),
 * which starts each global replacement from index 0, so sharing them is safe for that usage.
 * Reset lastIndex (or use a fresh literal) before using them in any other way.
 */
// Matches an opening a / img / script / link / track / meta tag that has a src, href or url
// attribute whose quoted value starts with "/" or "http(s)://". Capture groups, in the order
// the replace callback receives them:
//   1: the tag text up to (not including) the attribute name
//   2: the "=" (with optional whitespace) plus the opening quote
//   3: the opening quote character on its own (back-referenced as \3)
//   4: the URL; appears intended to end before the closing quote, a "?" or a "#"
//   5: the remainder of the tag, up to and including the closing ">"
var regexpZimitHtmlLinks = /(<(?:a|img|script|link|track|meta)\b[^>]*?[\s;])(?:src\b|href|url)\s*(=\s*(["']))(?=\/|https?:\/\/)((?:[^>](?!\3|\?|#))+[^>])([^>]*>)/ig;
// Matches an absolute (http:// or https://) or protocol-relative (//) URL that is preceded by
// a quote or "(" and followed by a quote, "?", "#" or ")". Group 1 is the URL without any
// query string or fragment.
var regexpZimitJavascriptLinks = /['"(]((?:https?:)?\/\/[^'"?#)]+)['"?#)]/ig;
// Matches a CSS-style url(...) reference, tolerating optional quotes and whitespace around the
// value. Group 1 is the bare URL without quotes or surrounding whitespace.
var regexpZimitCssLinks = /\burl\s*\(['"\s]*([^)'"\s]+)['"\s]*\)/ig;
/** /**
* The main function for transforming Zimit URLs into standard ZIM URLs. * The main function for transforming Zimit URLs into standard ZIM URLs.
* @param {dirEntry} dirEntry The directory entry that points to the extracted data * @param {dirEntry} dirEntry The directory entry that points to the extracted data
@ -83,16 +90,16 @@ define([], function () {
function transformReplayUrls(dirEntry, data, mimetype, selectedArchive) { function transformReplayUrls(dirEntry, data, mimetype, selectedArchive) {
/** /**
* Transform URL links in HTML files * Transform URL links in HTML files
* Note that some Zimit ZIMs have mimteypes like 'text/html;raw=true', so we can't simply match 'text/html' * Note that some Zimit ZIMs have mimetypes like 'text/html;raw=true', so we can't simply match 'text/html'
* Other ZIMs have mimetype like 'html' (with no 'text/'), so we have to match as generically as possible * Other ZIMs have a mimetype like 'html' (with no 'text/'), so we have to match as generically as possible
*/ */
if (/\bhtml\b/i.test(mimetype)) { // if (/\bhtml\b/i.test(mimetype)) { //
var zimitPrefix = data.match(/link\s+rel=["']canonical["']\s+href=(['"])https?:\/\/([^\/]+)(.+?)\1/i); var zimitPrefix = data.match(/link\s+rel=["']canonical["']\s+href=(['"])https?:\/\/([^\/]+)(.+?)\1/i);
zimitPrefix = zimitPrefix ? zimitPrefix[2] : params.zimitPrefix; zimitPrefix = zimitPrefix ? zimitPrefix[2] : params.zimitPrefix;
// Remove lazy-image system and noscript tags that comment out images // Remove lazy-image system and noscript tags that comment out images
// DEV: Check if this is still necessary
data = data.replace(/<noscript>\s*(<img\b[^>]+>)\s*<\/noscript>/ig, '$1'); data = data.replace(/<noscript>\s*(<img\b[^>]+>)\s*<\/noscript>/ig, '$1');
data = data.replace(/<span\b[^>]+lazy-image-placeholder[^<]+<\/span>\s*/ig, ''); data = data.replace(/<span\b[^>]+lazy-image-placeholder[^<]+<\/span>\s*/ig, '');
var regexpZimitHtmlLinks = /(<(?:a|img|script|link|track|meta)\b[^>]*?[\s;])(?:src\b|href|url)\s*(=\s*(["']))(?=\/|https?:\/\/)((?:[^>](?!\3|\?|#))+[^>])([^>]*>)/ig;
// Get stem for constructing an absolute URL // Get stem for constructing an absolute URL
var indexRoot = window.location.pathname.replace(/[^\/]+$/, '') + encodeURI(selectedArchive._file.name); var indexRoot = window.location.pathname.replace(/[^\/]+$/, '') + encodeURI(selectedArchive._file.name);
data = data.replace(regexpZimitHtmlLinks, function(match, blockStart, equals, quote, relAssetUrl, blockClose) { data = data.replace(regexpZimitHtmlLinks, function(match, blockStart, equals, quote, relAssetUrl, blockClose) {
@ -133,13 +140,12 @@ define([], function () {
return assetUrl; return assetUrl;
}); });
// Remove any <base href...> statements // Remove any <base href...> statements
// DEV: You should probably deal with this more intelligently, changing absolute links rather than just removing // DEV: You should probably deal with this more intelligently, changing absolute links rather than just removing,
// but so far, removing it seems to do the job // but so far, removing it seems to do the job
data = data.replace(/<base\b[^>]+href\b[^>]+>\s*/i, ''); data = data.replace(/<base\b[^>]+href\b[^>]+>\s*/i, '');
// Remove any residual analytics // Remove any residual analytics and ads
data = data.replace(/<script\b([^<]|<(?!\/script>))+?(?:google.*?analytics|adsbygoogle)([^<]|<(?!\/script>))+<\/script>\s*/i, ''); data = data.replace(/<script\b([^<]|<(?!\/script>))+?(?:google.*?analytics|adsbygoogle)([^<]|<(?!\/script>))+<\/script>\s*/i, '');
data = data.replace(/<ins\b(?:[^<]|<(?!\/ins>))+?adsbygoogle(?:[^<]|<(?!\/ins>))+<\/ins>\s*/ig, ''); data = data.replace(/<ins\b(?:[^<]|<(?!\/ins>))+?adsbygoogle(?:[^<]|<(?!\/ins>))+<\/ins>\s*/ig, '');
@ -170,7 +176,6 @@ define([], function () {
* Transform css-style links in stylesheet files and stylesheet blocks in HTML * Transform css-style links in stylesheet files and stylesheet blocks in HTML
*/ */
if (/\b(css|html)\b/i.test(mimetype)) { if (/\b(css|html)\b/i.test(mimetype)) {
var regexpZimitCssLinks = /\burl\s*\(['"\s]*([^)'"\s]+)['"\s]*\)/ig;
data = data.replace(regexpZimitCssLinks, function (match, url) { data = data.replace(regexpZimitCssLinks, function (match, url) {
var newBlock = match; var newBlock = match;
var assetUrl = url; var assetUrl = url;
@ -195,7 +200,6 @@ define([], function () {
* Transform links in JavaScript files or script blocks in the html * Transform links in JavaScript files or script blocks in the html
*/ */
if (/\b(javascript|html)\b/i.test(mimetype)) { if (/\b(javascript|html)\b/i.test(mimetype)) {
var regexpZimitJavascriptLinks = /['"(]((?:https?:)?\/\/[^'"?#)]+)['"?#)]/ig;
data = data.replace(regexpZimitJavascriptLinks, function (match, url) { data = data.replace(regexpZimitJavascriptLinks, function (match, url) {
var newBlock = match; var newBlock = match;
var assetUrl = url; var assetUrl = url;