this repo has no description
at main 2.1 kB view raw
import { debug } from 'debug';
import { Readability } from '@mozilla/readability';
import { Window } from 'happy-dom';

const log = debug('llms-txt-gen:dom');

/**
 * Creates a happy-dom window located at `url` and loads `html` into the
 * body of its document.
 *
 * @param url - Address the document should report as its location.
 * @param html - Markup assigned to `document.body.innerHTML`.
 * @returns The happy-dom document, typed as a DOM `Document`.
 */
export function parseBody(url: URL, html: string): Document {
  const window = new Window({ url: `${url}` });
  const document = window.document;
  document.body.innerHTML = html;
  // The happy-dom document is cast so callers can use the standard DOM
  // `Document` type (presumably the two declarations are not assignable).
  return document as any;
}

/** Main content extracted from a page. */
export interface Content {
  /** Page title. */
  title: string;
  /** Extracted content as an HTML string. */
  html: string;
}

/**
 * Extracts the main content of a page.
 *
 * Readability is tried first; when it yields no content, the inner HTML of
 * the first `<article>` element is used, and failing that the first `<main>`
 * element.
 *
 * NOTE(review): according to the Readability README, `parse()` modifies the
 * document it is given — confirm callers do not rely on the original DOM
 * (e.g. for link extraction) after calling this function.
 *
 * @param document - Document to extract content from.
 * @returns The title and content HTML, or `undefined` when no content was
 *   found by any strategy.
 */
export function extractContent(document: Document): Content | undefined {
  const readability = new Readability(document, {
    nbTopCandidates: 10,
    charThreshold: 100,
    disableJSONLD: true,
  });
  const result = readability.parse();
  let content = result?.content;
  if (!content) {
    content = document.querySelector('article')?.innerHTML;
    if (!content) {
      content = document.querySelector('main')?.innerHTML;
    }
    log('extracted fallback', document.location.pathname);
  } else {
    log('extracted readability', document.location.pathname);
  }
  return content ? {
    // `||` (not `??`) so an empty Readability title also falls back.
    title: result?.title || document.title,
    html: content,
  } : undefined;
}

/**
 * Collects the same-origin pages linked from `document`.
 *
 * Links are reduced to their pathname (query string and fragment are
 * dropped), de-duplicated, and returned with links found outside `<nav>`
 * elements first, followed by links found inside `<nav>` elements. Links to
 * the current pathname, to other origins, without an `href`, or with an
 * `href` that cannot be parsed are skipped.
 *
 * @param document - Document whose anchors are inspected.
 * @returns Absolute URLs built from each unique pathname against the
 *   document's location.
 */
export function extractLinks(document: Document): URL[] {
  const currentUrl = new URL(document.location.href);
  const maybeToHref = (element: Element): string | null => {
    if (element.tagName !== 'A')
      return null;
    const link = element as HTMLAnchorElement;
    // Inspect the raw attribute: the `href` property is the resolved URL,
    // so it does not start with '#' even for same-page fragment links.
    const rawHref = link.getAttribute('href');
    if (rawHref === null || rawHref.startsWith('#'))
      return null;
    let href: URL;
    try {
      href = new URL(link.href, document.location.href);
    } catch {
      // `new URL` throws on malformed input; one bad link must not abort
      // extraction for the whole page.
      return null;
    }
    if (currentUrl.pathname === href.pathname)
      return null;
    if (href.origin !== currentUrl.origin)
      return null;
    return href.pathname;
  };
  const inNavAnchors: string[] = [];
  const outNavAnchors: string[] = [];
  for (const anchor of Array.from(document.querySelectorAll('a'))) {
    const pathname = maybeToHref(anchor);
    if (!pathname)
      continue;
    // A selector such as ':not(nav) a' also matches anchors inside <nav>
    // (any non-nav ancestor satisfies it), so classify by nearest <nav>
    // ancestor instead.
    if (anchor.closest('nav'))
      inNavAnchors.push(pathname);
    else
      outNavAnchors.push(pathname);
  }
  const pathnames = [...new Set([...outNavAnchors, ...inNavAnchors])];
  return pathnames.map((pathname) => new URL(pathname, currentUrl));
}