···
2
-
* HTML sanitizer inspired by Mastodon's Sanitize::Config
3
-
* Sanitizes HTML content to prevent XSS while preserving safe formatting
6
-
// URL schemes accepted for the `src` of oEmbed elements: validateProtocols()
// removes a `src` whose detected scheme is not in this list.
const HTTP_PROTOCOLS = ['http', 'https']
8
-
const LINK_PROTOCOLS = [
22
-
// Captures the scheme (group 1) at the start of a URL; getScheme() lower-cases
// the capture before it is compared against the protocol allowlists.
//
// The "//" authority marker is deliberately NOT required: schemes such as
// `javascript:` and `data:` carry no "//", and requiring it made them look
// like "no scheme at all", which the callers treat as acceptable.
//
// Leading control characters/spaces and embedded tab/CR/LF are tolerated
// because browsers strip them before resolving a URL. An obfuscated scheme
// (e.g. "java\tscript:") is therefore still captured, and fails the allowlist
// lookup because the capture keeps the embedded whitespace.
const PROTOCOL_REGEX = /^[\u0000-\u0020]*([a-z][a-z0-9.+\-\t\n\r]*):/i
24
-
interface SanitizeOptions {
25
-
allowOembed?: boolean
29
-
* Sanitizes HTML content following Mastodon's strict rules
31
-
export function sanitizeHtml(
33
-
options: SanitizeOptions = {},
35
-
if (typeof DOMParser === 'undefined') {
36
-
// Fallback for environments without DOMParser
37
-
return sanitizeTextOnly(html)
40
-
const parser = new DOMParser()
41
-
const doc = parser.parseFromString(html, 'text/html')
42
-
const body = doc.body
44
-
sanitizeNode(body, options)
46
-
return body.innerHTML
49
-
function sanitizeNode(node: Node, options: SanitizeOptions): void {
50
-
const childNodes = Array.from(node.childNodes)
52
-
for (const child of childNodes) {
53
-
if (child.nodeType === Node.ELEMENT_NODE) {
54
-
const element = child as HTMLElement
55
-
const tagName = element.tagName.toLowerCase()
57
-
// Define allowed elements
58
-
const allowedElements = options.allowOembed
108
-
// Handle unsupported elements (h1-h6) - convert to <strong> wrapped in <p>
109
-
if (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName)) {
110
-
const strong = element.ownerDocument!.createElement('strong')
111
-
while (element.firstChild) {
112
-
strong.appendChild(element.firstChild)
114
-
const p = element.ownerDocument!.createElement('p')
115
-
p.appendChild(strong)
116
-
element.replaceWith(p)
117
-
sanitizeNode(p, options)
121
-
// Handle math elements - extract annotation text
122
-
if (tagName === 'math') {
123
-
const mathText = extractMathAnnotation(element)
125
-
const textNode = element.ownerDocument!.createTextNode(mathText)
126
-
element.replaceWith(textNode)
133
-
if (tagName === 'li') {
134
-
// Keep li elements but sanitize their children
135
-
sanitizeNode(element, options)
139
-
// Remove elements not in allowlist
140
-
if (!allowedElements.includes(tagName)) {
141
-
// Replace with text content
142
-
const textNode = element.ownerDocument!.createTextNode(
143
-
element.textContent || '',
145
-
element.replaceWith(textNode)
149
-
// Sanitize attributes
150
-
sanitizeAttributes(element, options)
152
-
// Recursively sanitize children
153
-
sanitizeNode(element, options)
158
-
function sanitizeAttributes(
159
-
element: HTMLElement,
160
-
options: SanitizeOptions,
162
-
const tagName = element.tagName.toLowerCase()
163
-
const allowedAttrs: Record<string, string[]> = {
164
-
a: ['href', 'rel', 'class', 'translate'],
165
-
span: ['class', 'translate'],
166
-
ol: ['start', 'reversed'],
171
-
if (options.allowOembed) {
172
-
allowedAttrs.audio = ['controls']
173
-
allowedAttrs.iframe = [
181
-
allowedAttrs.source = ['src', 'type']
182
-
allowedAttrs.video = ['controls', 'height', 'loop', 'width']
185
-
const allowed = allowedAttrs[tagName] || []
186
-
const attrs = Array.from(element.attributes)
188
-
// Remove non-allowed attributes
189
-
for (const attr of attrs) {
190
-
const attrName = attr.name.toLowerCase()
191
-
const isAllowed = allowed.some(a => {
192
-
if (a.endsWith('*')) {
193
-
return attrName.startsWith(a.slice(0, -1))
195
-
return a === attrName
199
-
element.removeAttribute(attr.name)
203
-
// Process specific attributes
204
-
if (tagName === 'a') {
205
-
processAnchorElement(element)
208
-
// Process class whitelist
209
-
if (element.hasAttribute('class')) {
210
-
processClassWhitelist(element)
213
-
// Process translate attribute - remove unless it's "no"
214
-
if (element.hasAttribute('translate')) {
215
-
const translate = element.getAttribute('translate')
216
-
if (translate !== 'no') {
217
-
element.removeAttribute('translate')
221
-
// Validate protocols for elements with src/href
222
-
if (element.hasAttribute('href') || element.hasAttribute('src')) {
223
-
validateProtocols(element, options)
227
-
function processAnchorElement(element: HTMLElement): void {
228
-
// Add required attributes
229
-
element.setAttribute('rel', 'nofollow noopener')
230
-
element.setAttribute('target', '_blank')
232
-
// Check if href has unsupported protocol
233
-
const href = element.getAttribute('href')
235
-
const scheme = getScheme(href)
236
-
if (scheme !== null && scheme !== 'relative' && !LINK_PROTOCOLS.includes(scheme)) {
237
-
// Replace element with its text content
238
-
const textNode = element.ownerDocument!.createTextNode(
239
-
element.textContent || '',
241
-
element.replaceWith(textNode)
246
-
function processClassWhitelist(element: HTMLElement): void {
247
-
const classList = element.className.split(/[\t\n\f\r ]+/).filter(Boolean)
248
-
const whitelisted = classList.filter(className => {
249
-
// microformats classes
250
-
if (/^[hpuedt]-/.test(className)) return true
251
-
// semantic classes
252
-
if (/^(mention|hashtag)$/.test(className)) return true
253
-
// link formatting classes
254
-
if (/^(ellipsis|invisible)$/.test(className)) return true
255
-
// quote inline class
256
-
if (className === 'quote-inline') return true
260
-
if (whitelisted.length > 0) {
261
-
element.className = whitelisted.join(' ')
263
-
element.removeAttribute('class')
267
-
function validateProtocols(
268
-
element: HTMLElement,
269
-
options: SanitizeOptions,
271
-
const tagName = element.tagName.toLowerCase()
272
-
const src = element.getAttribute('src')
273
-
const href = element.getAttribute('href')
274
-
const url = src || href
278
-
const scheme = getScheme(url)
280
-
// For oembed elements, only allow HTTP protocols for src
282
-
options.allowOembed &&
284
-
['iframe', 'source'].includes(tagName)
286
-
if (scheme !== null && !HTTP_PROTOCOLS.includes(scheme)) {
287
-
element.removeAttribute('src')
289
-
// Add sandbox attribute to iframes
290
-
if (tagName === 'iframe') {
291
-
element.setAttribute(
293
-
'allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox allow-forms',
299
-
function getScheme(url: string): string | null {
300
-
const match = url.match(PROTOCOL_REGEX)
302
-
return match[1].toLowerCase()
304
-
// Check if it's a relative URL
305
-
if (url.startsWith('/') || url.startsWith('.')) {
312
-
* Extract math annotation from MathML element
313
-
* Follows FEP-dc88 spec for math element representation
315
-
function extractMathAnnotation(mathElement: HTMLElement): string | null {
316
-
const semantics = Array.from(mathElement.children).find(
317
-
child => child.tagName.toLowerCase() === 'semantics',
318
-
) as HTMLElement | undefined
320
-
if (!semantics) return null
322
-
// Look for LaTeX annotation (application/x-tex)
323
-
const latexAnnotation = Array.from(semantics.children).find(child => {
325
-
child.tagName.toLowerCase() === 'annotation' &&
326
-
child.getAttribute('encoding') === 'application/x-tex'
330
-
if (latexAnnotation) {
331
-
const display = mathElement.getAttribute('display')
332
-
const text = latexAnnotation.textContent || ''
333
-
return display === 'block' ? `$$${text}$$` : `$${text}$`
336
-
// Look for plain text annotation
337
-
const plainAnnotation = Array.from(semantics.children).find(child => {
339
-
child.tagName.toLowerCase() === 'annotation' &&
340
-
child.getAttribute('encoding') === 'text/plain'
344
-
if (plainAnnotation) {
345
-
return plainAnnotation.textContent || null
352
-
/**
 * Fallback sanitizer used when DOMParser is unavailable: removes every
 * complete `<...>` tag and returns what is left.
 *
 * Angle brackets that survive the stripping pass are entity-escaped. Without
 * this, an unterminated tag at the end of the input (e.g.
 * `<img src=x onerror=...` with no closing `>`) would be returned verbatim and
 * could turn into a live element once the caller embeds the result in markup.
 *
 * @param html - Untrusted HTML string.
 * @returns The input with tags removed and any stray `<` / `>` escaped.
 */
function sanitizeTextOnly(html: string): string {
  return html
    .replace(/<[^>]*>/g, '')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
}