patch of Render original HTML text of posts bridged from the Fediverse or Wafrn · round #1 · pull #26 · jollywhoppers.com/witchsky.app

-356

src/lib/strings/html-sanitizer.ts

···

       1
       -
       /**

     

       2
       -
        * HTML sanitizer inspired by Mastodon's Sanitize::Config

     

       3
       -
        * Sanitizes HTML content to prevent XSS while preserving safe formatting

     

       4
       -
        */

     

       5
       -
       

     

       6
       -
       /** URL schemes accepted for the `src` of oEmbed `iframe`/`source` elements. */
       const HTTP_PROTOCOLS = ['http', 'https']

       /** URL schemes accepted for anchor `href` values. */
       const LINK_PROTOCOLS = [
         ...HTTP_PROTOCOLS,
         'dat',
         'dweb',
         'ipfs',
         'ipns',
         'ssb',
         'gopher',
         'xmpp',
         'magnet',
         'gemini',
       ]

       /** Captures the scheme of a URL written in `scheme://` form. */
       const PROTOCOL_REGEX = /^([a-z][a-z0-9.+-]*):\/\//i

       /** Switches accepted by the sanitizer entry point and its helpers. */
       interface SanitizeOptions {
         /** When truthy, `audio`, `iframe`, `source` and `video` are kept too. */
         allowOembed?: boolean
       }

     

       27
       -
       

     

       28
       -
       /**
        * Sanitizes an HTML string with the Mastodon-style allowlist rules.
        *
        * @param html - Markup to sanitize.
        * @param options - Sanitizer switches; defaults to `{}`.
        * @returns The serialized contents of the parsed `<body>` after sanitizing,
        * or the tag-stripped input when no `DOMParser` global exists.
        */
       export function sanitizeHtml(
         html: string,
         options: SanitizeOptions = {},
       ): string {
         const hasDomParser = typeof DOMParser !== 'undefined'
         if (!hasDomParser) {
           // No DOM available: fall back to plain tag stripping
           return sanitizeTextOnly(html)
         }

         const {body} = new DOMParser().parseFromString(html, 'text/html')
         sanitizeNode(body, options)
         return body.innerHTML
       }

     

       48
       -
       

     

       49
       -
       /** Elements that are always kept in sanitized output. */
       const BASE_ALLOWED_ELEMENTS = [
         'p',
         'br',
         'span',
         'a',
         'del',
         's',
         'pre',
         'blockquote',
         'code',
         'b',
         'strong',
         'u',
         'i',
         'em',
         'ul',
         'ol',
         'li',
         'ruby',
         'rt',
         'rp',
       ]

       /** Allowlist used when `allowOembed` is set: the base set plus media elements. */
       const OEMBED_ALLOWED_ELEMENTS = [
         ...BASE_ALLOWED_ELEMENTS,
         'audio',
         'iframe',
         'source',
         'video',
       ]

       /** Heading elements, which are rewritten to `<p><strong>…</strong></p>`. */
       const HEADING_ELEMENTS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

       /**
        * Recursively sanitizes the element children of `node` in place.
        *
        * - `h1`-`h6` become `<p><strong>…</strong></p>`.
        * - `math` is replaced by its annotation text, or removed when it has none.
        * - Any other element outside the allowlist is replaced by its text content.
        * - Allowlisted elements (including `li`) have their attributes sanitized and
        *   are then descended into.
        *
        * @param node - Parent whose children are sanitized.
        * @param options - Sanitizer switches; `allowOembed` widens the allowlist.
        */
       function sanitizeNode(node: Node, options: SanitizeOptions): void {
         const allowedElements = options.allowOembed
           ? OEMBED_ALLOWED_ELEMENTS
           : BASE_ALLOWED_ELEMENTS

         // Iterate over a snapshot: children are replaced/removed during the loop
         for (const child of Array.from(node.childNodes)) {
           if (child.nodeType !== Node.ELEMENT_NODE) continue

           const element = child as HTMLElement
           const tagName = element.tagName.toLowerCase()

           // Handle unsupported elements (h1-h6) - convert to <strong> wrapped in <p>
           if (HEADING_ELEMENTS.includes(tagName)) {
             const strong = element.ownerDocument.createElement('strong')
             while (element.firstChild) {
               strong.appendChild(element.firstChild)
             }
             const p = element.ownerDocument.createElement('p')
             p.appendChild(strong)
             element.replaceWith(p)
             sanitizeNode(p, options)
             continue
           }

           // Handle math elements - extract annotation text
           if (tagName === 'math') {
             const mathText = extractMathAnnotation(element)
             if (mathText) {
               element.replaceWith(element.ownerDocument.createTextNode(mathText))
             } else {
               element.remove()
             }
             continue
           }

           // Remove elements not in allowlist, keeping only their text
           if (!allowedElements.includes(tagName)) {
             element.replaceWith(
               element.ownerDocument.createTextNode(element.textContent || ''),
             )
             continue
           }

           // No special case for `li`: it is allowlisted, so it takes the normal
           // path and its attributes are filtered like every other element's.
           sanitizeAttributes(element, options)

           // Attribute processing can swap an anchor for a text node; a detached
           // element no longer contributes to the output, so skip descending.
           if (element.parentNode !== node) continue

           sanitizeNode(element, options)
         }
       }

     

       157
       -
       

     

       158
       -
       /**
        * Removes every attribute that is not allowlisted for the element's tag and
        * then runs the anchor, class, `translate` and URL post-processing steps.
        *
        * @param element - Element whose attributes are filtered in place.
        * @param options - Sanitizer switches; `allowOembed` adds the media tags.
        */
       function sanitizeAttributes(
         element: HTMLElement,
         options: SanitizeOptions,
       ): void {
         const tagName = element.tagName.toLowerCase()

         const allowedByTag: Record<string, string[]> = {
           a: ['href', 'rel', 'class', 'translate'],
           span: ['class', 'translate'],
           ol: ['start', 'reversed'],
           li: ['value'],
           p: ['class'],
           ...(options.allowOembed
             ? {
                 audio: ['controls'],
                 iframe: [
                   'allowfullscreen',
                   'frameborder',
                   'height',
                   'scrolling',
                   'src',
                   'width',
                 ],
                 source: ['src', 'type'],
                 video: ['controls', 'height', 'loop', 'width'],
               }
             : {}),
         }
         const permitted = allowedByTag[tagName] || []

         // An entry ending in `*` is a prefix pattern; anything else is exact
         const matches = (pattern: string, name: string): boolean =>
           pattern.endsWith('*')
             ? name.startsWith(pattern.slice(0, -1))
             : pattern === name

         // Snapshot the attribute list because it is mutated while iterating
         Array.from(element.attributes).forEach(attr => {
           const lowered = attr.name.toLowerCase()
           if (!permitted.some(pattern => matches(pattern, lowered))) {
             element.removeAttribute(attr.name)
           }
         })

         if (tagName === 'a') {
           processAnchorElement(element)
         }

         if (element.hasAttribute('class')) {
           processClassWhitelist(element)
         }

         // `translate` survives only with the value "no"
         if (
           element.hasAttribute('translate') &&
           element.getAttribute('translate') !== 'no'
         ) {
           element.removeAttribute('translate')
         }

         if (element.hasAttribute('href') || element.hasAttribute('src')) {
           validateProtocols(element, options)
         }
       }

     

       226
       -
       

     

       227
       -
       /**
        * Forces `rel="nofollow noopener"` and `target="_blank"` onto an anchor, and
        * replaces the anchor with its text content when `getScheme` reports a scheme
        * for its `href` that is neither `'relative'` nor listed in `LINK_PROTOCOLS`.
        *
        * @param element - The anchor element to process in place.
        */
       function processAnchorElement(element: HTMLElement): void {
         element.setAttribute('rel', 'nofollow noopener')
         element.setAttribute('target', '_blank')

         const href = element.getAttribute('href')
         if (!href) return

         const scheme = getScheme(href)
         if (
           scheme === null ||
           scheme === 'relative' ||
           LINK_PROTOCOLS.includes(scheme)
         ) {
           return
         }

         // Unsupported scheme: keep the visible text, drop the link itself
         const plainText = element.textContent || ''
         element.replaceWith(element.ownerDocument!.createTextNode(plainText))
       }

     

       245
       -
       

     

       246
       -
       /**
        * Reduces the element's class list to the allowlisted names: microformats
        * prefixes (`h-`, `p-`, `u-`, `e-`, `d-`, `t-`), `mention`, `hashtag`,
        * `ellipsis`, `invisible` and `quote-inline`. The `class` attribute is
        * removed entirely when nothing survives.
        *
        * @param element - Element whose `class` attribute is rewritten in place.
        */
       function processClassWhitelist(element: HTMLElement): void {
         const allowedPatterns = [
           /^[hpuedt]-/, // microformats classes
           /^(mention|hashtag)$/, // semantic classes
           /^(ellipsis|invisible)$/, // link formatting classes
         ]
         const isAllowed = (name: string): boolean =>
           allowedPatterns.some(pattern => pattern.test(name)) ||
           name === 'quote-inline'

         const kept: string[] = []
         for (const name of element.className.split(/[\t\n\f\r ]+/)) {
           // Splitting can yield empty strings at the edges; skip them
           if (name && isAllowed(name)) {
             kept.push(name)
           }
         }

         if (kept.length === 0) {
           element.removeAttribute('class')
           return
         }
         element.className = kept.join(' ')
       }

     

       266
       -
       

     

       267
       -
       /**
        * Validates the `src` of oEmbed media elements.
        *
        * Only acts when `allowOembed` is set and the element is an `iframe` or
        * `source` with a non-empty `src`. The `src` is kept only when its scheme is
        * positively identified as one of `HTTP_PROTOCOLS`; relative URLs and values
        * whose scheme cannot be determined are removed. Iframes additionally
        * receive a `sandbox` attribute.
        *
        * @param element - Element whose `src` is checked in place.
        * @param options - Sanitizer switches.
        */
       function validateProtocols(
         element: HTMLElement,
         options: SanitizeOptions,
       ): void {
         if (!options.allowOembed) return

         const tagName = element.tagName.toLowerCase()
         if (tagName !== 'iframe' && tagName !== 'source') return

         const src = element.getAttribute('src')
         if (!src) return

         // Allowlist check: an unrecognized (null) scheme is rejected as well, so
         // a value that merely fails to parse cannot slip through.
         const scheme = getScheme(src)
         if (scheme === null || !HTTP_PROTOCOLS.includes(scheme)) {
           element.removeAttribute('src')
         }

         // Add sandbox attribute to iframes
         if (tagName === 'iframe') {
           element.setAttribute(
             'sandbox',
             'allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox allow-forms',
           )
         }
       }

     

       298
       -
       

     

       299
       -
       /** Matches a leading URL scheme (`scheme:`), whether or not `//` follows. */
       const URL_SCHEME_REGEX = /^([a-z][a-z0-9.+-]*):/i

       /**
        * Classifies a URL by its scheme.
        *
        * The input is first normalized the way URL parsers do it: ASCII tab and
        * newline characters are dropped and leading control characters/spaces are
        * trimmed, so values such as `" java\nscript:…"` are recognized as
        * `javascript`. A scheme is detected from `scheme:` alone, which covers
        * schemes that never use `//` (e.g. `magnet:`, `xmpp:`, `javascript:`,
        * `data:`).
        *
        * @param url - Raw attribute value.
        * @returns The lower-cased scheme, `'relative'` when the value starts with
        * `/` or `.`, or `null` when neither applies.
        */
       function getScheme(url: string): string | null {
         const normalized = url
           .replace(/[\t\n\r]/g, '')
           .replace(/^[\u0000-\u0020]+/, '')

         const scheme = URL_SCHEME_REGEX.exec(normalized)?.[1]
         if (scheme !== undefined) {
           return scheme.toLowerCase()
         }

         // Check if it's a relative URL
         if (normalized.startsWith('/') || normalized.startsWith('.')) {
           return 'relative'
         }
         return null
       }

     

       310
       -
       

     

       311
       -
       /**
        * Pulls a textual representation out of a MathML `<math>` element
        * (FEP-dc88 style).
        *
        * Looks at the direct `<semantics>` child and its direct `<annotation>`
        * children. An `application/x-tex` annotation wins and is wrapped in `$…$`
        * (or `$$…$$` when the math element has `display="block"`); otherwise a
        * `text/plain` annotation's text is returned as-is.
        *
        * @param mathElement - The `<math>` element to inspect.
        * @returns The annotation text, or `null` when there is no `<semantics>`
        * child, no matching annotation, or the plain annotation is empty.
        */
       function extractMathAnnotation(mathElement: HTMLElement): string | null {
         let semantics: Element | undefined
         for (const candidate of Array.from(mathElement.children)) {
           if (candidate.tagName.toLowerCase() === 'semantics') {
             semantics = candidate
             break
           }
         }
         if (!semantics) return null

         const annotations = Array.from(semantics.children).filter(
           node => node.tagName.toLowerCase() === 'annotation',
         )
         const withEncoding = (encoding: string): Element | undefined =>
           annotations.find(node => node.getAttribute('encoding') === encoding)

         // LaTeX annotation takes priority over the plain-text one
         const latex = withEncoding('application/x-tex')
         if (latex) {
           const delimiter =
             mathElement.getAttribute('display') === 'block' ? '$$' : '$'
           const source = latex.textContent || ''
           return delimiter + source + delimiter
         }

         const plain = withEncoding('text/plain')
         return plain ? plain.textContent || null : null
       }

     

       350
       -
       

     

       351
       -
       /**
        * Fallback sanitizer for environments without a DOM parser.
        *
        * Removes every complete `<…>` tag. Any `<` still left afterwards can only
        * belong to an unterminated tag (no closing `>` follows it), so it is
        * escaped to `&lt;` to keep it from being interpreted as markup.
        *
        * @param html - Markup to strip.
        * @returns The input with tags removed and stray `<` escaped.
        */
       function sanitizeTextOnly(html: string): string {
         return html.replace(/<[^>]*>/g, '').replace(/</g, '&lt;')
       }

+56 -11

src/components/Post/MastodonHtmlContent.tsx

···

       1
       -
       import {useMemo} from 'react'

     

       2
       -
       import {type StyleProp, type TextStyle, View, type ViewStyle} from 'react-native'

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       3
        
       import {type AppBskyFeedPost} from '@atproto/api'

     

       0
        
       
     

       0
        
       
     

       4
        
       

     

       5
        
       import {useRenderMastodonHtml} from '#/state/preferences/render-mastodon-html'

     

       6
       -
       import { atoms } from '#/alf'

     

       0
        
       
     

       7
        
       import {InlineLinkText} from '#/components/Link'

     

       8
        
       import {P, Text} from '#/components/Typography'

     

       9
        
       

     
···

       36
        
         numberOfLines,

     

       37
        
       }: MastodonHtmlContentProps) {

     

       38
        
         const renderMastodonHtml = useRenderMastodonHtml()

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       39
        
       

     

       40
        
         const renderedContent = useMemo(() => {

     

       41
        
           if (!renderMastodonHtml) return null

     
···

       53
        
           return sanitizeAndRenderHtml(rawHtml, numberOfLines, textStyle)

     

       54
        
         }, [record, renderMastodonHtml, numberOfLines, textStyle])

     

       55
        
       

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       56
        
         if (!renderedContent) return null

     

       57
        
       

     

       58
       -
         return <View style={style}>{renderedContent}</View>

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       59
        
       }

     

       60
        
       

     

       61
        
       const LINK_PROTOCOLS = [

     
···

       111
        
         const doc = parser.parseFromString(html, 'text/html')

     

       112
        
       

     

       113
        
         const textStyle: StyleProp<TextStyle> = [

     

       114
       -
           atoms.leading_snug,

     

       115
       -
           atoms.text_md,

     

       116
        
           inputTextStyle,

     

       117
        
         ]

     

       118
        
       

     
···

       300
        
         }

     

       301
        
       

     

       302
        
         const content = Array.from(doc.body.childNodes).map((node, i) =>

     

       303
       -
           renderNode(node, i),

     

       304
        
         )

     

       305
        
       

     

       306
        
         return (

     
···

       326
        
         // Remove non-allowed attributes

     

       327
        
         for (const attr of attrs) {

     

       328
        
           const attrName = attr.name.toLowerCase()

     

       329
       -
           const isAllowed = allowed.some(a => {

     

       330
       -
             if (a.endsWith('*')) {

     

       331
       -
               return attrName.startsWith(a.slice(0, -1))

     

       332
        
             }

     

       333
       -
             return a === attrName

     

       334
        
           })

     

       335
        
       

     

       336
        
           if (!isAllowed) {

···

       1
       +
       import {useMemo, useState} from 'react'

     

       2
       +
       import {

     

       3
       +
         type LayoutChangeEvent,

     

       4
       +
         type StyleProp,

     

       5
       +
         type TextStyle,

     

       6
       +
         View,

     

       7
       +
         type ViewStyle,

     

       8
       +
       } from 'react-native'

     

       9
        
       import {type AppBskyFeedPost} from '@atproto/api'

     

       10
       +
       import {msg, Trans} from '@lingui/macro'

     

       11
       +
       import {useLingui} from '@lingui/react'

     

       12
        
       

     

       13
        
       import {useRenderMastodonHtml} from '#/state/preferences/render-mastodon-html'

     

       14
       +
       import {atoms as a} from '#/alf'

     

       15
       +
       import {Button, ButtonText} from '#/components/Button'

     

       16
        
       import {InlineLinkText} from '#/components/Link'

     

       17
        
       import {P, Text} from '#/components/Typography'

     

       18
        
       

     
···

       45
        
         numberOfLines,

     

       46
        
       }: MastodonHtmlContentProps) {

     

       47
        
         const renderMastodonHtml = useRenderMastodonHtml()

     

       48
       +
         const {_} = useLingui()

     

       49
       +
         const [isExpanded, setIsExpanded] = useState(false)

     

       50
       +
         const [contentHeight, setContentHeight] = useState<number | null>(null)

     

       51
       +
         const [isTall, setIsTall] = useState(false)

     

       52
        
       

     

       53
        
         const renderedContent = useMemo(() => {

     

       54
        
           if (!renderMastodonHtml) return null

     
···

       66
        
           return sanitizeAndRenderHtml(rawHtml, numberOfLines, textStyle)

     

       67
        
         }, [record, renderMastodonHtml, numberOfLines, textStyle])

     

       68
        
       

     

       69
       +
         const handleLayout = (event: LayoutChangeEvent) => {

     

       70
       +
           const height = event.nativeEvent.layout.height

     

       71
       +
           if (contentHeight === null) {

     

       72
       +
             setContentHeight(height)

     

       73
       +
             // Consider content "tall" if it's taller than 150px

     

       74
       +
             setIsTall(height > 150)

     

       75
       +
           }

     

       76
       +
         }

     

       77
       +
       

     

       78
        
         if (!renderedContent) return null

     

       79
        
       

     

       80
       +
         const shouldCollapse = isTall && !isExpanded

     

       81
       +
       

     

       82
       +
         return (

     

       83
       +
           <View style={style}>

     

       84
       +
             <View

     

       85
       +
               style={shouldCollapse ? {maxHeight: 150, overflow: 'hidden'} : undefined}

     

       86
       +
               onLayout={handleLayout}>

     

       87
       +
               {renderedContent}

     

       88
       +
             </View>

     

       89
       +
             {shouldCollapse && (

     

       90
       +
               <Button

     

       91
       +
                 label={_(msg`Show more`)}

     

       92
       +
                 onPress={() => setIsExpanded(true)}

     

       93
       +
                 variant="ghost"

     

       94
       +
                 color="primary"

     

       95
       +
                 size="small"

     

       96
       +
                 style={[a.mt_xs]}>

     

       97
       +
                 <ButtonText>

     

       98
       +
                   <Trans>Show more</Trans>

     

       99
       +
                 </ButtonText>

     

       100
       +
               </Button>

     

       101
       +
             )}

     

       102
       +
           </View>

     

       103
       +
         )

     

       104
        
       }

     

       105
        
       

     

       106
        
       const LINK_PROTOCOLS = [

     
···

       156
        
         const doc = parser.parseFromString(html, 'text/html')

     

       157
        
       

     

       158
        
         const textStyle: StyleProp<TextStyle> = [

     

       159
       +
           a.leading_snug,

     

       160
       +
           a.text_md,

     

       161
        
           inputTextStyle,

     

       162
        
         ]

     

       163
        
       

     
···

       345
        
         }

     

       346
        
       

     

       347
        
         const content = Array.from(doc.body.childNodes).map((node, i) =>

     

       348
       +
           renderNode(node, String(i)),

     

       349
        
         )

     

       350
        
       

     

       351
        
         return (

     
···

       371
        
         // Remove non-allowed attributes

     

       372
        
         for (const attr of attrs) {

     

       373
        
           const attrName = attr.name.toLowerCase()

     

       374
       +
           const isAllowed = allowed.some(allowedAttr => {

     

       375
       +
             if (allowedAttr.endsWith('*')) {

     

       376
       +
               return attrName.startsWith(allowedAttr.slice(0, -1))

     

       377
        
             }

     

       378
       +
             return allowedAttr === attrName

     

       379
        
           })

     

       380
        
       

     

       381
        
           if (!isAllowed) {