this repo has no description
1import { debug } from 'debug';
2import { Readability } from '@mozilla/readability';
3import { Window } from 'happy-dom';
4
// Module-scoped `debug` logger; messages are emitted under the 'llms-txt-gen:dom' namespace.
const log = debug('llms-txt-gen:dom');
6
/**
 * Builds a happy-dom document for a fetched page.
 *
 * A new `Window` is created with `url` as its location, and `html` is assigned
 * to the `innerHTML` of the document body.
 *
 * @param url - Address the markup was loaded from; becomes the window URL.
 * @param html - Markup to place inside `<body>`.
 * @returns The window's document, typed as a DOM `Document`.
 */
export function parseBody(url: URL, html: string): Document {
  const { document } = new Window({ url: url.href });
  document.body.innerHTML = html;
  // NOTE(review): the cast presumably bridges happy-dom's own Document class and
  // the lib.dom `Document` type — confirm before tightening it.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  return document as any;
}
13
/** Main content extracted from a page, as returned by `extractContent`. */
export interface Content {
  /** Title reported by Readability, or the document title when that is empty. */
  title: string;
  /** Extracted content as an HTML string. */
  html: string;
}
18
/**
 * Extracts the main content of a page.
 *
 * Readability is tried first. If it yields no content, the inner HTML of the
 * first `<article>` element is used, and failing that, the first `<main>`
 * element. Which path ran is logged together with the page pathname.
 *
 * @param document - Parsed page; it is handed directly to Readability.
 * @returns The title and content HTML, or `undefined` when no content was found.
 */
export function extractContent(document: Document): Content | undefined {
  const parsed = new Readability(document, {
    nbTopCandidates: 10,
    charThreshold: 100,
    disableJSONLD: true,
  }).parse();

  let html = parsed?.content;
  if (html) {
    log('extracted readability', document.location.pathname);
  } else {
    // `||` (not `??`) so an empty <article> still falls through to <main>.
    html =
      document.querySelector('article')?.innerHTML ||
      document.querySelector('main')?.innerHTML;
    log('extracted fallback', document.location.pathname);
  }

  if (!html) {
    return undefined;
  }
  // An empty Readability title falls back to the document's own title.
  return { title: parsed?.title || document.title, html };
}
41
/**
 * Collects the same-origin pages linked from a document.
 *
 * Each HTML anchor is resolved against the document URL. Anchors are skipped
 * when they are fragment-only, point to another origin, point back to the
 * current pathname, or carry an href that cannot be parsed as a URL. Only the
 * pathname of each target is kept, so query strings and fragments are dropped
 * and targets differing only in those collapse into one entry.
 *
 * Ordering: links found outside any `<nav>` come first (document order),
 * followed by links found inside a `<nav>`; duplicates keep their first slot.
 *
 * @param document - Parsed page whose `location.href` is the page URL.
 * @returns De-duplicated absolute URLs on the document's origin.
 */
export function extractLinks(document: Document): URL[] {
  const currentUrl = new URL(document.location.href);

  // Maps an element to the pathname it links to, or null when it should be ignored.
  const toPathname = (element: Element): string | null => {
    if (element.tagName !== 'A') {
      return null;
    }
    const rawHref = (element as HTMLAnchorElement).href;
    if (rawHref.startsWith('#')) {
      return null;
    }
    let target: URL;
    try {
      target = new URL(rawHref, currentUrl);
    } catch {
      // A single malformed href must not abort link extraction for the whole page.
      return null;
    }
    if (target.origin !== currentUrl.origin || target.pathname === currentUrl.pathname) {
      return null;
    }
    return target.pathname;
  };

  // A selector such as ':not(nav) a' matches every anchor (html/body are
  // non-nav ancestors), so nav membership is decided per anchor with closest().
  const outsideNav: string[] = [];
  const insideNav: string[] = [];
  for (const anchor of Array.from(document.querySelectorAll('a'))) {
    const pathname = toPathname(anchor);
    if (pathname === null) {
      continue;
    }
    (anchor.closest('nav') ? insideNav : outsideNav).push(pathname);
  }

  const uniquePathnames = [...new Set([...outsideNav, ...insideNav])];
  return uniquePathnames.map((pathname) => new URL(pathname, currentUrl));
}