this repo has no description
at main 7.1 kB view raw
import type { Heading, List, ListItem, PhrasingContent, Root } from 'mdast';
import { unified } from 'unified';
import { SKIP, visit } from 'unist-util-visit';
import { defaultSchema as defaultSanitizeSchema } from 'hast-util-sanitize';
import rehypeParse from 'rehype-parse';
import rehypeSanitize from 'rehype-sanitize';
import rehypeStringify from 'rehype-stringify';
import rehypeRemark from 'rehype-remark';
import remarkStringify from 'remark-stringify';
import remarkUnlink from 'remark-unlink';
import remarkNormalizeHeadings from 'remark-normalize-headings';
import remarkGfm from 'remark-gfm';
import remarkParse from 'remark-parse';
import remarkSqueezeParagraphs from 'remark-squeeze-paragraphs';

/** Headings whose entire (trimmed) text is one of these are dropped. */
const BOILERPLATE_HEADINGS: ReadonlySet<string> = new Set([
  'Example',
  'Remarks',
  'Note',
  '',
]);

/** Single-text paragraphs whose (trimmed) text is one of these are emptied. */
const BOILERPLATE_PARAGRAPHS: ReadonlySet<string> = new Set([
  'Loading...',
  'Caution',
  'tsx',
]);

/**
 * Flattens phrasing content to plain text.
 *
 * Breaks become newlines, inline code keeps its backticks, and wrapper nodes
 * (delete / emphasis / link / strong) contribute only the text of their
 * children. Every other node type contributes nothing.
 */
const toString = (nodes: PhrasingContent[]): string =>
  nodes
    .map((node) => {
      switch (node.type) {
        case 'break':
          return '\n';
        case 'delete':
        case 'emphasis':
        case 'link':
        case 'strong':
          return toString(node.children);
        case 'inlineCode':
          return `\`${node.value}\``;
        case 'text':
          return node.value;
        case 'footnoteReference':
        case 'html':
        case 'image':
        case 'imageReference':
        case 'linkReference':
        default:
          return '';
      }
    })
    .join('');

/**
 * Removes the child at `at` from `siblings` and returns the visitor result
 * that makes `visit` skip the removed node's subtree and resume at the same
 * index, which now holds the following sibling.
 */
function dropAt(siblings: unknown[], at: number): [typeof SKIP, number] {
  siblings.splice(at, 1);
  return [SKIP, at];
}

/**
 * Parses `html`, runs it through rehype-sanitize and serializes it back to
 * an HTML string.
 *
 * The allowed tag list is the default sanitize schema's list without
 * `details`, plus a handful of structural tags; `script`, `style` and
 * `details` are listed under `strip`.
 *
 * @param html - HTML source to sanitize.
 * @returns The sanitized HTML.
 */
export async function sanitizeHtml(html: string): Promise<string> {
  const vfile = await unified()
    .use(rehypeParse)
    .use(rehypeSanitize, {
      tagNames: [
        ...(defaultSanitizeSchema.tagNames ?? []).filter(
          (tag) => tag !== 'details',
        ),
        'content-region',
        'footer',
        'header',
        'main',
        'article',
        'section',
        'nav',
      ],
      strip: ['script', 'style', 'details'],
    })
    .use(rehypeStringify)
    .process(html);
  return vfile.toString();
}

/**
 * Converts an HTML fragment to Markdown, discarding content matched by the
 * `remarkDisqualify` rules below, unlinking links and normalizing headings.
 * Zero-width characters (U+200B–U+200D, U+FEFF) are stripped from the result.
 *
 * @param content - `html` is the fragment to convert. `title` is part of the
 *   accepted shape but is not read by this function.
 * @returns The generated Markdown.
 */
export async function htmlToMarkdown(content: {
  title: string;
  html: string;
}): Promise<string> {
  /**
   * Tree transform that removes or rewrites nodes:
   * - thematic breaks, images / image references and raw HTML are removed;
   * - links / link references without children are removed;
   * - a table with exactly two rows becomes a list whose i-th item holds the
   *   i-th cell of the first row followed by the i-th cell of the second;
   * - headings that are empty or consist solely of a boilerplate label are
   *   removed;
   * - a paragraph's only text child is removed when it is boilerplate.
   */
  function remarkDisqualify() {
    return function (tree: Root) {
      visit(tree, function (node, index, parent) {
        if (!parent || typeof index !== 'number') {
          return;
        }
        // Every removal/replacement returns `[SKIP, index]`. Without it the
        // traversal would advance to `index + 1` and never look at the
        // sibling that shifted into the freed slot.
        switch (node.type) {
          case 'thematicBreak':
          case 'image':
          case 'imageReference':
          case 'html':
            return dropAt(parent.children, index);

          case 'link':
          case 'linkReference':
            if (node.children.length === 0) {
              return dropAt(parent.children, index);
            }
            return;

          case 'table': {
            if (node.children.length !== 2) {
              return;
            }
            const heading = node.children[0]!;
            const items = node.children[1]!;
            const listItems = heading.children.map((headCell, idx) => {
              // The second row may have fewer cells than the first; treat a
              // missing cell as empty instead of throwing.
              const itemCell = items.children[idx];
              return {
                type: 'listItem',
                spread: false,
                children: [
                  ...headCell.children,
                  ...(itemCell ? itemCell.children : []),
                ],
              } as ListItem;
            });
            parent.children.splice(index, 1, {
              type: 'list',
              spread: false,
              children: listItems,
            } as List);
            // Skip the detached table and visit the replacement list next.
            return [SKIP, index];
          }

          case 'heading': {
            const child = node.children[0];
            if (!child) {
              return dropAt(parent.children, index);
            }
            if (node.children.length > 1 || child.type !== 'text') {
              return;
            }
            if (BOILERPLATE_HEADINGS.has(child.value.trim())) {
              return dropAt(parent.children, index);
            }
            return;
          }

          case 'text': {
            // Only a text node that is the sole child of a paragraph counts.
            if (parent.type !== 'paragraph' || parent.children.length > 1) {
              return;
            }
            const value = node.value.trim();
            if (
              value.startsWith('Last updated on ') ||
              value.startsWith('Copyright ') ||
              BOILERPLATE_PARAGRAPHS.has(value)
            ) {
              return dropAt(parent.children, index);
            }
            return;
          }

          default:
            return;
        }
      });
    };
  }

  const md = await unified()
    .use(rehypeParse, { fragment: true })
    .use(rehypeSanitize, {
      strip: ['script', 'style', 'nav'],
    })
    .use(remarkGfm, {
      tablePipeAlign: false,
      tableCellPadding: false,
    })
    .use(rehypeRemark, { document: false })
    .use(remarkDisqualify)
    .use(remarkUnlink)
    .use(remarkNormalizeHeadings)
    .use(remarkSqueezeParagraphs)
    .use(remarkStringify, {
      incrementListMarker: false,
      ruleSpaces: false,
      tightDefinitions: true,
    })
    .process(content.html);
  return md.toString().replace(/[\u200B-\u200D\uFEFF]/g, '');
}

/**
 * Joins several Markdown documents with blank lines and re-serializes the
 * result through remark (GFM, heading normalization, paragraph squeezing).
 *
 * @param contents - Markdown strings, or promises of them; `null` entries
 *   contribute nothing.
 * @returns The combined Markdown.
 */
export async function concatMarkdown(
  contents: (string | null)[] | Promise<string | null>[]
): Promise<string> {
  const parts = (await Promise.all(contents)).filter(
    (part): part is string => part !== null,
  );
  const md = await unified()
    .use(remarkParse, { fragment: true })
    .use(remarkGfm, {
      tablePipeAlign: false,
      tableCellPadding: false,
    })
    .use(remarkNormalizeHeadings)
    .use(remarkSqueezeParagraphs)
    .use(remarkStringify, {
      incrementListMarker: false,
      ruleSpaces: false,
      tightDefinitions: true,
    })
    .process(parts.join('\n\n'));
  return md.toString();
}

/**
 * Returns the plain text of `markdown`'s first node when that node is a
 * depth-1 heading, otherwise `null`.
 */
function extractTitle(markdown: string): string | null {
  const tree = unified()
    .use(remarkParse, { fragment: true })
    .use(remarkGfm, {
      tablePipeAlign: false,
      tableCellPadding: false,
    })
    .parse(markdown);
  const node = tree.children[0];
  if (node && node.type === 'heading' && node.depth === 1) {
    return toString(node.children);
  } else {
    return null;
  }
}

/**
 * remark plugin that forces the document to start with a depth-1 heading
 * whose text is `opts.title`.
 *
 * If the first node is already a heading (of any depth) it is rewritten in
 * place; otherwise a new heading is inserted at the front.
 */
export function remarkTitle(opts: { title: string }) {
  return function checkTitleTransformer(root: Root) {
    const node = root.children[0];
    const replacement: Heading = {
      type: 'heading',
      depth: 1,
      children: [
        { type: 'text', value: opts.title }
      ]
    };
    if (node && node.type === 'heading') {
      node.depth = 1;
      node.children = replacement.children;
    } else {
      root.children.unshift(replacement);
    }
  }
}

/**
 * Copies the title of one Markdown document onto another.
 *
 * @param from - Document whose leading depth-1 heading supplies the title.
 * @param to - Document that receives the title.
 * @returns `to` unchanged when `from` has no (non-empty) title; otherwise
 *   `to` re-serialized with the title applied via {@link remarkTitle}.
 */
export async function transferTitle(from: string, to: string): Promise<string> {
  const title = extractTitle(from);
  if (!title) return to;
  const md = await unified()
    .use(remarkParse, { fragment: true })
    .use(remarkGfm, {
      tablePipeAlign: false,
      tableCellPadding: false,
    })
    .use(remarkTitle, { title })
    .use(remarkStringify, {
      bullet: '-',
      incrementListMarker: false,
      ruleSpaces: false,
      tightDefinitions: true,
    })
    .process(to);
  return md.toString();
}