Repository URL to install this package:
Version:
3.1.7 ▾
|
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.htmlDocIsImageOnly = exports.removeWrappingParagraphAndTrailingEmptyElements = exports.extractHtmlBody = exports.isSelfClosingTag = exports.attributesHtml = void 0;
const Entities = require('html-entities').AllHtmlEntities;
const htmlentities = new Entities().encode;
const url_1 = require("@joplin/utils/url");
const htmlparser2 = require('@joplin/fork-htmlparser2');
// [\s\S] instead of . for multiline matching
// https://stackoverflow.com/a/16119722/561309
//
// Both patterns expose three capture groups:
//   1. everything between the tag name and the `src` / `href` attribute,
//   2. the attribute value found between single or double quotes,
//   3. whatever follows the closing quote up to the first `>`.
// All quantifiers are lazy, and the `g` and `i` flags make the patterns match
// every occurrence regardless of letter case.
// NOTE(review): there is no word boundary after `<img` / `<a`, so `anchorRegex`
// can also start a match on tags such as `<abbr` or `<area` - confirm that
// this is acceptable for the callers.
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
// Lowercase tag names that this module treats as self-closing. The list is
// consulted by `isSelfClosingTag`, which lowercases its argument before
// looking it up here.
const selfClosingElements = [
'area',
'base',
'basefont',
'br',
'col',
'command',
'embed',
'frame',
'hr',
'img',
'input',
'isindex',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr',
];
/**
 * Serialises an attribute map into the text that goes inside an HTML tag.
 *
 * Attributes with a falsy value are written as a bare name (for example
 * `disabled`); all others are written as `name="value"` with the value passed
 * through `htmlentities`. Only own enumerable properties are considered.
 *
 * @param {Object} attr Map of attribute names to values. `null` and
 *   `undefined` produce an empty string.
 * @returns {string} The attributes separated by single spaces, without a
 *   leading or trailing space.
 */
const attributesHtml = (attr) => {
	// Borrow the method from Object.prototype so that maps created with
	// Object.create(null), or maps that contain a key named "hasOwnProperty",
	// do not make the ownership check throw.
	const hasOwn = Object.prototype.hasOwnProperty;
	const output = [];
	for (const name in attr) {
		if (!hasOwn.call(attr, name)) continue;
		const value = attr[name];
		output.push(value ? `${name}="${htmlentities(value)}"` : name);
	}
	return output.join(' ');
};
exports.attributesHtml = attributesHtml;
/**
 * Tells whether `tagName` appears in `selfClosingElements`. The name is
 * lowercased before the lookup, so the check ignores letter case.
 *
 * @param {string} tagName Tag name to look up.
 * @returns {boolean} `true` when the tag is listed as self-closing.
 */
const isSelfClosingTag = (tagName) => {
	const lowerCasedName = tagName.toLowerCase();
	return selfClosingElements.some((element) => element === lowerCasedName);
};
exports.isSelfClosingTag = isSelfClosingTag;
/**
 * Helpers that rewrite, strip and sanitise HTML strings.
 */
class HtmlUtils {
	/**
	 * Rewrites every tag matched by `imageRegex` according to `callback`.
	 *
	 * @param {string} html HTML to scan. A falsy value returns ''.
	 * @param {Function} callback Called with `{ src, before, after }` for each
	 *   match. A falsy return value keeps the tag (re-emitted with double
	 *   quotes around the source). Otherwise the returned object selects the
	 *   rewrite through its `type`:
	 *   - `'replaceElement'`: the tag is replaced by `action.html`;
	 *   - `'replaceSource'`: only the source is replaced by `action.src`;
	 *   - `'setAttributes'`: the `src` attribute is replaced by the serialised
	 *     `action.attrs`.
	 * @returns {string} The rewritten HTML.
	 * @throws {Error} When `action.type` is none of the above.
	 */
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	processImageTags(html, callback) {
		if (!html) return '';

		return html.replace(imageRegex, (_v, before, src, after) => {
			const action = callback({ src, before, after });

			if (!action) return `<img${before}src="${src}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			if (action.type === 'replaceSource') {
				return `<img${before}src="${action.src}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = (0, exports.attributesHtml)(action.attrs);
				return `<img${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	/**
	 * Rewrites every tag matched by `anchorRegex` according to `callback`.
	 *
	 * @param {string} html HTML to scan. A falsy value returns ''.
	 * @param {Function} callback Called with `{ href }` for each match. A falsy
	 *   return value keeps the tag (re-emitted with double quotes around the
	 *   link). Otherwise the returned object selects the rewrite through its
	 *   `type`:
	 *   - `'replaceElement'`: the opening tag is replaced by `action.html`;
	 *   - `'replaceSource'`: only the link is replaced by `action.href`;
	 *   - `'setAttributes'`: the `href` attribute is replaced by the
	 *     serialised `action.attrs`.
	 * @returns {string} The rewritten HTML.
	 * @throws {Error} When `action.type` is none of the above.
	 */
	// eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied
	processAnchorTags(html, callback) {
		if (!html) return '';

		return html.replace(anchorRegex, (_v, before, href, after) => {
			const action = callback({ href: href });

			if (!action) return `<a${before}href="${href}"${after}>`;

			if (action.type === 'replaceElement') {
				return action.html;
			}

			// The rebuilt element must remain an anchor, like the unchanged
			// case above.
			if (action.type === 'replaceSource') {
				return `<a${before}href="${action.href}"${after}>`;
			}

			if (action.type === 'setAttributes') {
				const attrHtml = (0, exports.attributesHtml)(action.attrs);
				return `<a${before}${attrHtml}${after}>`;
			}

			throw new Error(`Invalid action: ${action.type}`);
		});
	}

	/**
	 * Converts HTML to plain text: tags are dropped, text located directly
	 * inside a disallowed tag (script, style, head, ...) is discarded, runs of
	 * whitespace are collapsed to one space and every `<` left in the text is
	 * written as `&lt;`.
	 *
	 * @param {string} html HTML to convert.
	 * @returns {string} The extracted text.
	 */
	stripHtml(html) {
		const output = [];
		const tagStack = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		const disallowedTags = ['script', 'style', 'head', 'iframe', 'frameset', 'frame', 'object', 'base'];

		const parser = new htmlparser2.Parser({
			onopentag: (name) => {
				tagStack.push(name.toLowerCase());
			},
			ontext: (decodedText) => {
				if (disallowedTags.includes(currentTag())) return;
				output.push(decodedText);
			},
			onclosetag: (name) => {
				if (currentTag() === name.toLowerCase()) tagStack.pop();
			},
		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		// In general, we want to get back plain text from this function, so all
		// HTML entities are decoded. However, to prevent XSS attacks, we
		// re-encode all the "<" characters, which should break any attempt to
		// inject HTML tags.
		return output.join('')
			.replace(/\s+/g, ' ')
			.replace(/</g, '&lt;');
	}

	/**
	 * Decides whether a URL may be kept in sanitised HTML.
	 *
	 * Accepted are `http(s)://`, `mailto:` and `joplin://` URLs, URLs that
	 * contain `:/` followed by 32 alphanumeric characters, anchors made of
	 * letters, digits and dashes, and `file://` URLs whose path starts with
	 * one of `allowedFilePrefixes`. All comparisons are case-insensitive.
	 *
	 * @param {string} url URL to check.
	 * @param {string[]} [allowedFilePrefixes=[]] Path prefixes under which
	 *   `file://` URLs are accepted.
	 * @returns {boolean} `true` when the URL is accepted.
	 */
	// This is tested in sanitize_links.md
	isAcceptedUrl(url, allowedFilePrefixes = []) {
		const lowerUrl = url.toLowerCase();

		if (lowerUrl.startsWith('https://') ||
			lowerUrl.startsWith('http://') ||
			lowerUrl.startsWith('mailto:') ||
			lowerUrl.startsWith('joplin://') ||
			// NOTE(review): this pattern is not anchored, so it accepts any URL
			// that merely contains ":/" followed by 32 alphanumeric characters.
			// Confirm whether it should be restricted to the start of the URL.
			!!lowerUrl.match(/:\/[0-9a-zA-Z]{32}/) ||
			// We also allow anchors but only with a specific set of a characters.
			// Fixes https://github.com/laurent22/joplin/issues/8286
			!!lowerUrl.match(/^#[a-zA-Z0-9-]+$/)) {
			return true;
		}

		if (lowerUrl.startsWith('file://')) {
			// We need to do a case insensitive comparison because the URL we
			// get appears to be converted to lowercase somewhere. To be
			// completely sure, we make it lowercase explicitly.
			const filePath = (0, url_1.fileUriToPath)(lowerUrl).toLowerCase();
			for (const filePrefix of allowedFilePrefixes) {
				if (filePath.startsWith(filePrefix.toLowerCase())) return true;
			}
		}

		return false;
	}

	/**
	 * Re-serialises `html` while removing content that could run scripts or
	 * escape the viewer: disallowed tags are dropped together with everything
	 * nested in them, `on*` attributes and `data-from-md` are removed, `href`
	 * values rejected by `isAcceptedUrl` become `#`, and `<form>` elements are
	 * emitted as `<div>`.
	 *
	 * @param {string} html HTML to sanitise.
	 * @param {Object} [options]
	 * @param {boolean} [options.addNoMdConvClass=false] Adds the
	 *   `jop-noMdConv` class to every emitted tag.
	 * @param {string[]} [options.allowedFilePrefixes=[]] Passed on to
	 *   `isAcceptedUrl` for `file://` links.
	 * @returns {string} The sanitised HTML.
	 */
	sanitizeHtml(html, options = null) {
		// Work on a copy so that the caller's options object is not modified.
		options = Object.assign({
			// If true, adds a "jop-noMdConv" class to all the tags.
			// It can be used afterwards to restore HTML tags in Markdown.
			addNoMdConvClass: false,
		}, options);

		// If options.allowedFilePrefixes is `null` or `undefined`, default to [].
		if (options.allowedFilePrefixes === null || options.allowedFilePrefixes === undefined) {
			options.allowedFilePrefixes = [];
		}

		const output = [];
		const tagStack = [];

		const currentTag = () => {
			if (!tagStack.length) return '';
			return tagStack[tagStack.length - 1];
		};

		// When we encounter a disallowed tag, all the other tags within it are
		// going to be skipped too. This is necessary to prevent certain XSS
		// attacks. See sanitize_11.md
		let disallowedTagDepth = 0;

		// The BASE tag allows changing the base URL from which files are
		// loaded, and that can break several plugins, such as Katex (which
		// needs to load CSS files using a relative URL). For that reason it is
		// disabled. More info: https://github.com/laurent22/joplin/issues/3021
		//
		// "link" can be used to escape the parser and inject JavaScript. Adding
		// "meta" too for the same reason as it shouldn't be used in notes
		// anyway.
		//
		// There are too many issues with SVG tags and to handle them properly
		// we should parse them separately. Currently we are not so it is better
		// to disable them. SVG graphics are still supported via the IMG tag.
		const disallowedTags = [
			'script', 'iframe', 'frameset', 'frame', 'object', 'base',
			'embed', 'link', 'meta', 'noscript', 'button',
			'input', 'select', 'textarea', 'option', 'optgroup',
			'svg',
			// Disallow map and area tags: <area ...> links are currently not
			// sanitized as well as <a ...> links, allowing potential sandbox
			// escape.
			'map', 'area',
		];

		// Certain tags should not be rendered, however unlike for the disallowed tags, we want to
		// keep their content. For example the FORM tag may sometimes wrap relevant content so we
		// want to keep that content, but we don't want to keep the FORM tag itself. In that case we
		// simply replace it with a DIV tag.
		const replaceWithDivTags = [
			'form',
		];

		const parser = new htmlparser2.Parser({
			onopentag: (name, attrs) => {
				// Note: "name" and attribute names are always lowercase even
				// when the input is not. So there is no need to call
				// "toLowerCase" on them.
				tagStack.push(name);

				if (disallowedTags.includes(currentTag())) {
					disallowedTagDepth++;
					return;
				}

				if (disallowedTagDepth) return;

				if (replaceWithDivTags.includes(currentTag())) {
					output.push('<div>');
					return;
				}

				attrs = Object.assign({}, attrs);

				// Remove all the attributes that start with "on", which
				// normally should be JavaScript events. A better solution
				// would be to blacklist known events only but it seems the
				// list is not well defined [0] and we don't want any to slip
				// through the cracks. A side effect of this change is a
				// regular harmless attribute that starts with "on" will also
				// be removed.
				// 0: https://developer.mozilla.org/en-US/docs/Web/Events
				for (const attrName of Object.keys(attrs)) {
					if (attrName.length > 2 && attrName.startsWith('on')) {
						delete attrs[attrName];
					}
				}

				// Make sure that non-acceptable URLs are filtered out. In
				// particular we want to exclude `javascript:` URLs. This
				// applies to A tags, and also AREA ones but to be safe we don't
				// filter on the tag name and process all HREF attributes.
				if ('href' in attrs && !this.isAcceptedUrl(attrs['href'], options.allowedFilePrefixes)) {
					attrs['href'] = '#';
				}

				// We need to clear any such attribute, otherwise it will
				// make any arbitrary link open within the application.
				if ('data-from-md' in attrs) {
					delete attrs['data-from-md'];
				}

				if (options.addNoMdConvClass) {
					let classAttr = attrs['class'] || '';
					if (!classAttr.includes('jop-noMdConv')) {
						classAttr += ' jop-noMdConv';
						attrs['class'] = classAttr.trim();
					}
				}

				// For some reason, entire parts of HTML notes don't show up in
				// the viewer when there's an anchor tag without an "href"
				// attribute. It doesn't always happen and it seems to depend on
				// what else is in the note but in any case adding the "href"
				// fixes it. https://github.com/laurent22/joplin/issues/5687
				if (name === 'a' && !attrs['href']) {
					attrs['href'] = '#';
				}

				let attrHtml = (0, exports.attributesHtml)(attrs);
				if (attrHtml) attrHtml = ` ${attrHtml}`;
				const closingSign = (0, exports.isSelfClosingTag)(name) ? '/>' : '>';
				output.push(`<${name}${attrHtml}${closingSign}`);
			},
			ontext: (decodedText) => {
				if (disallowedTagDepth) return;

				if (currentTag() === 'style') {
					// For CSS, we have to put the style as-is inside the tag
					// because if we html-entities encode it, it's not going to
					// work. But it's ok because JavaScript won't run within the
					// style tag. Ideally CSS should be loaded from an external
					// file.
					// We however have to encode at least the `<` characters to
					// prevent certain XSS injections that would rely on the
					// content not being encoded (see sanitize_13.md)
					output.push(decodedText.replace(/</g, '&lt;'));
				} else {
					output.push(htmlentities(decodedText));
				}
			},
			onclosetag: (name) => {
				const current = currentTag();

				if (current === name.toLowerCase()) tagStack.pop();

				// The Markdown sanitization code can result in calls like this:
				//     sanitizeHtml('<invalid>')
				//     sanitizeHtml('</invalid>')
				// Thus, we need to be able to remove '</invalid>', even if there is no
				// corresponding opening tag.
				if (disallowedTags.includes(current) || disallowedTags.includes(name)) {
					if (disallowedTagDepth > 0) {
						disallowedTagDepth--;
					}
					return;
				}

				if (disallowedTagDepth) return;

				// The stack may already have been popped at this point, so the
				// decision has to be based on the tag being closed and not on
				// whatever is now at the top of the stack.
				if (replaceWithDivTags.includes(name)) {
					output.push('</div>');
					return;
				}

				if ((0, exports.isSelfClosingTag)(name)) return;
				output.push(`</${name}>`);
			},
		}, { decodeEntities: true });

		parser.write(html);
		parser.end();

		return output.join('');
	}
}
/**
 * Builds the opening tag for an element.
 *
 * @param {string} name Tag name.
 * @param {Object} attrs Attribute map, serialised with `attributesHtml`.
 * @returns {string} `<name attrs>`, ending in `/>` instead of `>` when
 *   `isSelfClosingTag` reports the tag as self-closing.
 */
const makeHtmlTag = (name, attrs) => {
	const { attributesHtml: serialiseAttributes, isSelfClosingTag: isSelfClosing } = exports;
	const serialised = serialiseAttributes(attrs);
	const attributePart = serialised ? ` ${serialised}` : serialised;
	const tagEnd = isSelfClosing(name) ? '/>' : '>';
	return `<${name}${attributePart}${tagEnd}`;
};
/**
 * Returns the markup found inside the `<body>` element, rebuilt from the
 * parser events, or the input unchanged when no `<body>` tag is encountered
 * (in which case the input is taken to be an HTML fragment).
 *
 * @param {string} html Document or fragment to read.
 * @returns {string} The body content, or `html` itself.
 */
const extractHtmlBody = (html) => {
	const bodyParts = [];
	let sawBody = false;
	let insideBody = false;

	const handlers = {
		onopentag: (name, attrs) => {
			if (name === 'body') {
				// The <body> tag itself is never part of the result.
				sawBody = true;
				insideBody = true;
			} else if (insideBody) {
				bodyParts.push(makeHtmlTag(name, attrs));
			}
		},
		ontext: (encodedText) => {
			if (!insideBody) return;
			bodyParts.push(encodedText);
		},
		onclosetag: (name) => {
			if (!insideBody) return;
			if (name === 'body') {
				insideBody = false;
				return;
			}
			// Self-closing tags were fully written when they were opened.
			if (!(0, exports.isSelfClosingTag)(name)) {
				bodyParts.push(`</${name}>`);
			}
		},
	};

	// Entities are left encoded so that text can be copied through verbatim.
	const parser = new htmlparser2.Parser(handlers, { decodeEntities: false });
	parser.write(html);
	parser.end();

	if (!sawBody) return html;
	return bodyParts.join('');
};
exports.extractHtmlBody = extractHtmlBody;
/**
 * Unwraps HTML that consists of one leading `<p>` paragraph followed only by
 * empty, unstyled `div`, `style`, `span` or `p` elements and whitespace.
 *
 * The input is returned unchanged when it does not start with `<p>`, or when
 * anything after the first paragraph is non-blank text, carries a `style`
 * attribute, or is an element outside the list above.
 *
 * @param {string} html HTML to simplify.
 * @returns {string} The content of the first paragraph (plus any whitespace
 *   that followed it), or `html` itself when it cannot be simplified.
 */
const removeWrappingParagraphAndTrailingEmptyElements = (html) => {
	if (!html.startsWith('<p>')) return html;

	// Elements that may trail the paragraph without blocking simplification.
	const ignorableTrailingTags = ['div', 'style', 'span'];

	const openTags = [];
	const keptParts = [];
	let insideFirstParagraph = true;
	let simplifiable = true;

	const handleOpenTag = (name, attrs) => {
		if (insideFirstParagraph) {
			// The wrapping <p> itself (empty stack) is not copied.
			if (openTags.length > 0) keptParts.push(makeHtmlTag(name, attrs));
		} else if (attrs.style) {
			simplifiable = false;
		}
		openTags.push(name);
	};

	const handleText = (encodedText) => {
		const hasVisibleText = Boolean(encodedText.trim());
		if (hasVisibleText && !insideFirstParagraph) {
			simplifiable = false;
			return;
		}
		keptParts.push(encodedText);
	};

	const handleCloseTag = (name) => {
		openTags.pop();

		const closesTopLevelParagraph = openTags.length === 0 && name === 'p';
		if (closesTopLevelParagraph) {
			insideFirstParagraph = false;
			return;
		}

		if (insideFirstParagraph) {
			if (!(0, exports.isSelfClosingTag)(name)) keptParts.push(`</${name}>`);
			return;
		}

		// Many elements, even if empty, can still be visible.
		// For example, an <hr/>. Don't simplify if these elements
		// are present.
		if (!ignorableTrailingTags.includes(name)) simplifiable = false;
	};

	const parser = new htmlparser2.Parser({
		onopentag: handleOpenTag,
		ontext: handleText,
		onclosetag: handleCloseTag,
	});
	parser.write(html);
	parser.end();

	if (!simplifiable) return html;
	return keptParts.join('');
};
exports.removeWrappingParagraphAndTrailingEmptyElements = removeWrappingParagraphAndTrailingEmptyElements;
/**
 * Tells whether an HTML document contains exactly one `<img>` element and
 * nothing else of substance: no non-blank text and no tag other than `img`,
 * `meta`, `head`, `body` and `html`.
 *
 * @param {string} html Document to inspect.
 * @returns {boolean} `true` when the document is a single image.
 */
const htmlDocIsImageOnly = (html) => {
	// Ignore these tags that do not result in any Markdown (or HTML) code being generated.
	const ignoredTags = ['meta', 'head', 'body', 'html'];

	let imageTotal = 0;
	let sawOtherTag = false;
	let sawText = false;

	const parser = new htmlparser2.Parser({
		onopentag: (name) => {
			if (name === 'img') {
				imageTotal += 1;
				return;
			}
			if (!ignoredTags.includes(name)) {
				sawOtherTag = true;
			}
		},
		ontext: (text) => {
			if (text.trim()) {
				sawText = true;
			}
		},
	});

	parser.write(html);
	parser.end();

	if (sawOtherTag || sawText) return false;
	return imageTotal === 1;
};
exports.htmlDocIsImageOnly = htmlDocIsImageOnly;
exports.default = new HtmlUtils();
//# sourceMappingURL=htmlUtils.js.map