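// Crawl a site starting from a base URL, extract each page's main content
// as markdown, and prune the link graph to a spanning-tree subset of pages.
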
import { debug } from 'debug';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { URLPattern } from 'urlpattern-polyfill/urlpattern';
import { WeightedDiGraph, KruskalMST, Edge } from 'js-graph-algorithms';
import { extractContent, extractLinks, parseBody } from './dom';
import { fetchHtml } from './fetch';
import { htmlToMarkdown, sanitizeHtml, transferTitle } from './unified';
import { rewriteMarkdown } from './rewrite';
import { makeCacheFileHelper } from './path';

const log = debug('llms-txt-gen:graph');

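/**
 * Options for a crawl: `include` and `exclude` are URLPattern strings
 * matched against discovered URLs, resolved relative to `baseURL`.
 */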
export interface CrawlOptions {
  baseURL: URL | string;
  include?: string[];
  exclude?: string[];
}

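// On-disk cache for extracted markdown, keyed by page URL.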
const cacheDir = path.join(process.cwd(), '.cache/page');
await fs.mkdir(cacheDir, { recursive: true });
const getCacheFile = makeCacheFileHelper(cacheDir, '.md');

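/**
 * Extracts the main content of a page and converts it to markdown,
 * reusing the on-disk cache when a previous run already converted the URL.
 */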
async function extractContentToMarkdown(url: URL, html: string): Promise<string | null> {
  const cacheFile = await getCacheFile(url);
  try {
    const content = await fs.readFile(cacheFile, 'utf-8');
    if (content) {
      log('extracted output from cache', url.pathname);
      return content;
    }
  } catch {
    // Cache miss: fall through and extract from the HTML.
  }
  log('extracting content', url.pathname);
  const doc = parseBody(url, html);
  const output = extractContent(doc);
  if (output) {
    const markdown = await htmlToMarkdown(output);
    await fs.writeFile(cacheFile, markdown, 'utf-8');
    return markdown;
  } else {
    return null;
  }
}

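/**
 * A single crawl rooted at `baseURL`. Owns the canonical `Page` instance
 * for every URL seen, so link traversals share state.
 */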
class Root {
  baseURL: URL;
  #include: URLPattern[];
  #exclude: URLPattern[];
  #pages: Map<string, Page> = new Map();
  #origin: Page;

  constructor(opts: CrawlOptions) {
    const baseURL = new URL(opts.baseURL);
    const baseURLStr = baseURL.toString();
    this.baseURL = baseURL;
    this.#origin = new Page(this, baseURL);
    // Key by the normalized URL string so lookups via getPage() match
    // even when opts.baseURL was passed as a non-normalized string.
    this.#pages.set(baseURLStr, this.#origin);
    const toPattern = (pattern: string) => new URLPattern(pattern, baseURLStr);
    this.#include = opts.include?.map(toPattern) ?? [];
    this.#exclude = opts.exclude?.map(toPattern) ?? [];
  }

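  /**
   * A URL is crawlable when it shares the base origin, matches an include
   * pattern (when any are configured), and matches no exclude pattern.
   */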
  isURLAllowed(url: URL): boolean {
    if (url.origin !== this.baseURL.origin)
      return false;
    if (this.#include.length && !this.#include.some((pattern) => pattern.test(url)))
      return false;
    return !this.#exclude.some((pattern) => pattern.test(url));
  }

  get origin() {
    return this.#origin;
  }

  getPage(url: URL) {
    let page = this.#pages.get(url.toString());
    if (!page) {
      page = new Page(this, url);
      this.#pages.set(url.toString(), page);
    }
    return page;
  }

  async crawlPages(): Promise<Page[]> {
    return await crawlPages(this.#origin);
  }
}

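/**
 * A crawled page. HTML, outgoing links, and rendered markdown are fetched
 * lazily and memoized; `isPage` flips to false when the URL does not
 * resolve to an HTML document.
 */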
class Page {
  root: Root;
  url: URL;
  isPage = true;

  #html: string | null = null;
  #content: string | null = null;
  #links: Page[] | null = null;

  _id?: number;

  constructor(root: Root, url: URL) {
    this.root = root;
    this.url = url;
  }

  async getHTML() {
    if (this.#html !== null || !this.isPage)
      return this.#html;
    const content = await fetchHtml(this.url);
    if (!content) {
      // Not an HTML document (or the fetch failed): drop it from the crawl.
      this.isPage = false;
      return (this.#html = null);
    }
    return (this.#html = await sanitizeHtml(content));
  }

  async getLinks() {
    if (this.#links !== null || !this.isPage)
      return this.#links;
    const html = await this.getHTML();
    if (!html) return (this.#links = []);
    const doc = parseBody(this.url, html);
    const urls = extractLinks(doc);
    // Memoize the resolved pages so repeated traversals reuse them.
    return (this.#links = urls
      .filter((url) => this.root.isURLAllowed(url))
      .map((url) => this.root.getPage(url)));
  }

  async getContent(): Promise<string | null> {
    if (this.#content !== null || !this.isPage)
      return this.#content;
    const html = await this.getHTML();
    if (!html) return (this.#content = null);
    const markdown = await extractContentToMarkdown(this.url, html);
    if (!markdown) return (this.#content = null);
    const rewritten = await rewriteMarkdown(this.url, markdown);
    return (this.#content = await transferTitle(markdown, rewritten));
  }
}

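/**
 * Depth-first traversal from `page`, following only allowed links.
 * Returns every visited URL that resolved to a real page.
 */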
async function crawlPages(page: Page, visited = new Set<Page>([page]), depth = 1): Promise<Page[]> {
  const links = await page.getLinks();
  if (links) {
    log(`crawling (${links.length} links, depth = ${depth})`, page.url.pathname);
    for (const link of links) {
      if (visited.has(link)) continue;
      // Mark the link (not the current page) as visited before recursing,
      // so cycles between pages terminate.
      visited.add(link);
      await crawlPages(link, visited, depth + 1);
    }
  }
  return [...visited].filter((page) => !!page.isPage);
}

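/**
 * Crawls from `opts.baseURL`, builds a weighted digraph of the links
 * between pages, and keeps the origin plus the source page of every
 * edge in the Kruskal minimum spanning tree.
 */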
export async function crawl(opts: CrawlOptions): Promise<Page[]> {
  const root = new Root(opts);
  const pages = await root.crawlPages();
  if (pages.length <= 1) {
    return pages;
  }
  const graph = new WeightedDiGraph(pages.length);
  for (let idx = 0; idx < pages.length; idx++) {
    pages[idx]!._id = idx;
  }
  for (let i = 0; i < pages.length; i++) {
    const from = pages[i]!;
    const links = await from.getLinks();
    if (!links) continue;
    for (let j = 0; j < links.length; j++) {
      const to = links[j]!;
      if (to._id != null && to.isPage) {
        // Earlier links on a page get larger weights, so the minimum
        // spanning tree favors edges from later links.
        const weight = links.length - j;
        graph.addEdge(new Edge(i, to._id, weight));
      }
    }
  }
  const kruskal = new KruskalMST(graph);
  const output = new Set<Page>([root.origin]);
  for (const edge of kruskal.mst) {
    const page = pages[edge.from()];
    if (page) output.add(page);
  }
  return [...output];
}
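
// Example usage (a sketch; the URL and patterns below are illustrative,
// not defaults of this module):
//
//   const pages = await crawl({
//     baseURL: 'https://example.com/docs/',
//     include: ['/docs/*'],
//     exclude: ['/docs/archive/*'],
//   });
//   for (const page of pages) {
//     const markdown = await page.getContent();
//     if (markdown) console.log(page.url.pathname, markdown.length);
//   }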