···
-
* HTML sanitizer inspired by Mastodon's Sanitize::Config
-
* Sanitizes HTML content to prevent XSS while preserving safe formatting
-
/** URL schemes that validateProtocols compares an embedded element's URL scheme against. */
const HTTP_PROTOCOLS: string[] = [
  'http',
  'https',
]
-
const LINK_PROTOCOLS = [
-
/**
 * Captures a URL's scheme in group 1.
 *
 * The scheme is everything up to the first ':'; a following '//' is NOT
 * required. Requiring '://' would leave schemes such as `javascript:`,
 * `data:`, `mailto:` or `magnet:` unrecognised, so they would never be
 * compared against a protocol allowlist. Leading whitespace is tolerated
 * because URL parsers ignore it.
 */
const PROTOCOL_REGEX = /^\s*([a-z][a-z0-9.+-]*):/i
-
interface SanitizeOptions {
-
* Sanitizes HTML content following Mastodon's strict rules
-
// Public entry point: sanitizes an HTML string, using DOMParser when the runtime provides one
// and falling back to sanitizeTextOnly otherwise.
// NOTE(review): parts of this definition (the first parameter, the return type, the `body`
// binding, the final return and the closing braces) are not visible in this view — confirm
// the notes below against the complete file.
export function sanitizeHtml(
-
options: SanitizeOptions = {},
-
if (typeof DOMParser === 'undefined') {
-
// Fallback for environments without DOMParser
-
return sanitizeTextOnly(html)
-
// Parse as a text/html document so sanitizing works on DOM nodes rather than on string patterns.
const parser = new DOMParser()
-
const doc = parser.parseFromString(html, 'text/html')
-
// Sanitizes the tree in place. `body` is presumably doc.body — its declaration is not visible here.
sanitizeNode(body, options)
-
// Walks the element children of `node`, rewriting or replacing each one in place:
// headings become <p><strong>…</strong></p>, <math> becomes a text node, elements outside
// the allowlist become their text content, and everything else has its attributes
// sanitized before its own children are processed recursively.
// NOTE(review): control-flow lines (closing braces, any `continue`/`else`) and the contents
// of the allowlist are not visible in this view — the notes below are hedged accordingly.
function sanitizeNode(node: Node, options: SanitizeOptions): void {
-
// Snapshot the children first: the loop replaces nodes, and iterating a copy keeps the
// traversal unaffected by those replacements.
const childNodes = Array.from(node.childNodes)
-
for (const child of childNodes) {
-
if (child.nodeType === Node.ELEMENT_NODE) {
-
const element = child as HTMLElement
-
const tagName = element.tagName.toLowerCase()
-
// Define allowed elements
-
// The two allowlists selected by options.allowOembed are not visible in this view.
const allowedElements = options.allowOembed
-
// Handle unsupported elements (h1-h6) - convert to <strong> wrapped in <p>
-
if (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName)) {
-
const strong = element.ownerDocument!.createElement('strong')
-
// Moves (not copies) every child of the heading into the new <strong>.
while (element.firstChild) {
-
strong.appendChild(element.firstChild)
-
const p = element.ownerDocument!.createElement('p')
-
// The moved children have not been sanitized yet, so recurse into the replacement.
sanitizeNode(p, options)
-
// Handle math elements - extract annotation text
-
if (tagName === 'math') {
-
// NOTE(review): extractMathAnnotation is declared to return `string | null`; no null
// guard is visible before createTextNode — confirm one exists in the full file.
const mathText = extractMathAnnotation(element)
-
const textNode = element.ownerDocument!.createTextNode(mathText)
-
element.replaceWith(textNode)
-
if (tagName === 'li') {
-
// Keep li elements but sanitize their children
-
// NOTE(review): no sanitizeAttributes call is visible on this path — confirm that
// attributes on <li> (e.g. event handlers) are stripped somewhere.
sanitizeNode(element, options)
-
// Remove elements not in allowlist
-
if (!allowedElements.includes(tagName)) {
-
// Replace with text content
-
// textContent flattens the whole subtree, so nested markup is dropped along with the element.
const textNode = element.ownerDocument!.createTextNode(
-
element.textContent || '',
-
element.replaceWith(textNode)
-
sanitizeAttributes(element, options)
-
// Recursively sanitize children
-
sanitizeNode(element, options)
-
// Strips every attribute that is not allowlisted for the element's tag, then applies the
// attribute-specific rules (anchor handling, class allowlist, translate, URL protocols).
// NOTE(review): the first parameter, some guards and the closing braces are not visible
// in this view — the notes below are hedged accordingly.
function sanitizeAttributes(
-
options: SanitizeOptions,
-
const tagName = element.tagName.toLowerCase()
-
// Per-tag attribute allowlist; tags without an entry keep no attributes (see `allowed` below).
const allowedAttrs: Record<string, string[]> = {
-
a: ['href', 'rel', 'class', 'translate'],
-
span: ['class', 'translate'],
-
ol: ['start', 'reversed'],
-
// Embedded-media tags only receive attribute allowances when oEmbed content is permitted.
if (options.allowOembed) {
-
allowedAttrs.audio = ['controls']
-
// The iframe attribute list is not visible in this view.
allowedAttrs.iframe = [
-
allowedAttrs.source = ['src', 'type']
-
allowedAttrs.video = ['controls', 'height', 'loop', 'width']
-
const allowed = allowedAttrs[tagName] || []
-
// Copy the attribute list before the loop, which removes attributes from the element.
const attrs = Array.from(element.attributes)
-
// Remove non-allowed attributes
-
for (const attr of attrs) {
-
const attrName = attr.name.toLowerCase()
-
const isAllowed = allowed.some(a => {
-
// Prefix match against the allowlist entry minus its last character — presumably for
// wildcard entries such as `data-*`; the condition selecting this branch is not visible.
return attrName.startsWith(a.slice(0, -1))
-
element.removeAttribute(attr.name)
-
// Process specific attributes
-
// Presumably only reached for <a> elements; the guarding condition is not visible here.
processAnchorElement(element)
-
// Process class whitelist
-
if (element.hasAttribute('class')) {
-
processClassWhitelist(element)
-
// Process translate attribute - remove unless it's "no"
-
if (element.hasAttribute('translate')) {
-
const translate = element.getAttribute('translate')
-
if (translate !== 'no') {
-
element.removeAttribute('translate')
-
// Validate protocols for elements with src/href
-
if (element.hasAttribute('href') || element.hasAttribute('src')) {
-
validateProtocols(element, options)
-
// Hardens a link element: forces rel/target, then replaces the element with its text
// content when its href uses a scheme outside LINK_PROTOCOLS.
// NOTE(review): closing braces and any early-return guards are not visible in this view.
function processAnchorElement(element: HTMLElement): void {
-
// Add required attributes
-
// Overwrites any rel value supplied by the input.
element.setAttribute('rel', 'nofollow noopener')
-
element.setAttribute('target', '_blank')
-
// Check if href has unsupported protocol
-
// getAttribute returns null when the attribute is absent.
// NOTE(review): no null guard is visible before getScheme(href) — confirm.
const href = element.getAttribute('href')
-
const scheme = getScheme(href)
-
// Only a recognised, non-relative scheme can fail this check: a null scheme and 'relative'
// are both let through, so the link's safety depends on getScheme recognising every scheme
// a browser would act on. NOTE(review): confirm that is intended.
if (scheme !== null && scheme !== 'relative' && !LINK_PROTOCOLS.includes(scheme)) {
-
// Replace element with its text content
-
const textNode = element.ownerDocument!.createTextNode(
-
element.textContent || '',
-
element.replaceWith(textNode)
-
/**
 * Reduces an element's `class` attribute to the allowlisted class names.
 *
 * Allowed names are microformats2 classes (`h-*`, `p-*`, `u-*`, `dt-*`, `e-*`),
 * `mention`, `hashtag`, `ellipsis`, `invisible` and `quote-inline`. When no
 * class survives, the `class` attribute is removed entirely.
 *
 * @param element Element whose class list is filtered in place.
 */
function processClassWhitelist(element: HTMLElement): void {
  // Split on the ASCII whitespace characters that separate class tokens in HTML.
  const classList = element.className.split(/[\t\n\f\r ]+/).filter(Boolean)

  const whitelisted = classList.filter(className => {
    // microformats classes. The prefixes are alternatives, not a character class:
    // `dt-` is two letters, and single-letter `d-` / `t-` are not microformats prefixes.
    if (/^(h|p|u|dt|e)-/.test(className)) return true
    // semantic classes
    if (/^(mention|hashtag)$/.test(className)) return true
    // link formatting classes
    if (/^(ellipsis|invisible)$/.test(className)) return true
    // inline quote marker
    if (className === 'quote-inline') return true
    return false
  })

  if (whitelisted.length > 0) {
    element.className = whitelisted.join(' ')
  } else {
    element.removeAttribute('class')
  }
}
-
// Checks the URL scheme of an element's src/href and, for embedded elements, drops a `src`
// whose scheme is not in HTTP_PROTOCOLS; iframes additionally receive a sandbox value.
// NOTE(review): the first parameter, part of the oEmbed condition, the call that applies the
// sandbox string and the closing braces are not visible in this view.
function validateProtocols(
-
options: SanitizeOptions,
-
const tagName = element.tagName.toLowerCase()
-
const src = element.getAttribute('src')
-
const href = element.getAttribute('href')
-
// `src` wins when both attributes are present, so only one URL is inspected.
const url = src || href
-
const scheme = getScheme(url)
-
// For oembed elements, only allow HTTP protocols for src
-
['iframe', 'source'].includes(tagName)
-
// A null scheme (nothing recognised by getScheme) is not rejected here.
if (scheme !== null && !HTTP_PROTOCOLS.includes(scheme)) {
-
element.removeAttribute('src')
-
// Add sandbox attribute to iframes
-
if (tagName === 'iframe') {
-
// NOTE(review): `allow-scripts` together with `allow-same-origin` lets same-origin framed
// content remove its own sandbox — acceptable only if embeds are always cross-origin; confirm.
'allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox allow-forms',
-
/**
 * Classifies a URL by its scheme.
 *
 * @param url Raw attribute value; `null` (attribute absent) and the empty
 *   string are accepted and yield `null`.
 * @returns The lower-cased scheme captured by PROTOCOL_REGEX, `'relative'`
 *   for URLs starting with `/` or `.`, or `null` when neither applies.
 */
function getScheme(url: string | null): string | null {
  if (!url) return null

  // URL parsers drop tab/CR/LF anywhere in the input and ignore leading
  // control characters and spaces, so "java\tscript:…" or " https://…" must be
  // classified the same way the browser will eventually interpret them.
  const normalized = url
    .replace(/[\t\n\r]/g, '')
    .replace(/^[\u0000-\u0020]+/, '')

  const match = normalized.match(PROTOCOL_REGEX)
  if (match && match[1] !== undefined) {
    return match[1].toLowerCase()
  }

  // Check if it's a relative URL
  if (normalized.startsWith('/') || normalized.startsWith('.')) {
    return 'relative'
  }

  return null
}
-
/**
 * Extract math annotation from MathML element
 * Follows FEP-dc88 spec for math element representation
 *
 * Looks at the `<semantics>` child of the math element and prefers a LaTeX
 * annotation (`application/x-tex`), wrapped in `$…$` or, for
 * `display="block"`, in `$$…$$`. Falls back to a `text/plain` annotation.
 *
 * @param mathElement The `<math>` element to inspect.
 * @returns The annotation text, or `null` when there is no `<semantics>`
 *   child, no matching annotation, or the plain-text annotation is empty.
 */
function extractMathAnnotation(mathElement: HTMLElement): string | null {
  const semantics = Array.from(mathElement.children).find(
    child => child.tagName.toLowerCase() === 'semantics',
  )

  if (!semantics) return null

  // Finds the first direct <annotation> child of <semantics> with the given encoding.
  const findAnnotation = (encoding: string): Element | undefined =>
    Array.from(semantics.children).find(
      child =>
        child.tagName.toLowerCase() === 'annotation' &&
        child.getAttribute('encoding') === encoding,
    )

  // Look for LaTeX annotation (application/x-tex)
  const latexAnnotation = findAnnotation('application/x-tex')
  if (latexAnnotation) {
    const display = mathElement.getAttribute('display')
    const text = latexAnnotation.textContent || ''
    return display === 'block' ? `$$${text}$$` : `$${text}$`
  }

  // Look for plain text annotation
  const plainAnnotation = findAnnotation('text/plain')
  if (plainAnnotation) {
    return plainAnnotation.textContent || null
  }

  return null
}
-
/**
 * Fallback sanitizer that strips all HTML tags
 *
 * Used by sanitizeHtml when no DOMParser is available. Complete `<…>` tags are
 * removed; any `<` left afterwards belongs to an unterminated tag (for example
 * `<img src=x onerror=…` with no closing `>`), which the tag pattern cannot
 * match. Those are escaped so the result cannot open a tag when it is placed
 * into markup.
 *
 * @param html Untrusted HTML string.
 * @returns The input with tags removed and stray `<` escaped as `&lt;`.
 */
function sanitizeTextOnly(html: string): string {
  return html.replace(/<[^>]*>/g, '').replace(/</g, '&lt;')
}