import { debug } from 'debug';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { URLPattern } from 'urlpattern-polyfill/urlpattern';
import { WeightedDiGraph, KruskalMST, Edge } from 'js-graph-algorithms';
import { extractContent, extractLinks, parseBody } from "./dom";
import { fetchHtml } from "./fetch";
import { htmlToMarkdown, sanitizeHtml, transferTitle } from "./unified";
import { rewriteMarkdown } from './rewrite';
import { makeCacheFileHelper } from './path';

const log = debug('llms-txt-gen:graph');

export interface CrawlOptions {
  baseURL: URL | string;
  include?: string[];
  exclude?: string[];
}

// Extracted markdown is cached on disk so repeated runs skip re-extraction.
const cacheDir = path.join(process.cwd(), '.cache/page');
await fs.mkdir(cacheDir, { recursive: true });
const getCacheFile = makeCacheFileHelper(cacheDir, '.md');

async function extractContentToMarkdown(url: URL, html: string): Promise<string | null> {
  const cacheFile = await getCacheFile(url);
  try {
    const content = await fs.readFile(cacheFile, 'utf-8');
    if (content) {
      log('extracted output from cache', url.pathname);
      return content;
    }
  } catch {
    // Cache miss: fall through and extract from the HTML.
  }
  log('extracting content', url.pathname);
  const doc = parseBody(url, html);
  const output = extractContent(doc);
  if (output) {
    const markdown = await htmlToMarkdown(output);
    await fs.writeFile(cacheFile, markdown, 'utf-8');
    return markdown;
  } else {
    return null;
  }
}

// Root owns the page map and the include/exclude URL patterns for one crawl.
class Root {
  baseURL: URL;
  #include: URLPattern[];
  #exclude: URLPattern[];
  #pages: Map<string, Page> = new Map();
  #origin: Page;

  constructor(opts: CrawlOptions) {
    const baseURL = new URL(opts.baseURL);
    const baseURLStr = baseURL.toString();
    this.baseURL = baseURL;
    this.#origin = new Page(this, baseURL);
    // Key by the normalized URL string so getPage() finds the origin again.
    this.#pages.set(baseURLStr, this.#origin);
    const toPattern = (pattern: string) => new URLPattern(pattern, baseURLStr);
    this.#include = opts.include?.map(toPattern) ?? [];
    this.#exclude = opts.exclude?.map(toPattern) ?? [];
  }

  // A URL is crawlable if it shares the base origin, matches an include
  // pattern (when any are given), and matches no exclude pattern.
  isURLAllowed(url: URL): boolean {
    if (url.origin !== this.baseURL.origin)
      return false;
    if (this.#include.length) {
      if (this.#include.some((pattern) => pattern.test(url))) {
        return !this.#exclude.some((pattern) => pattern.test(url));
      } else {
        return false;
      }
    } else if (this.#exclude.length) {
      return !this.#exclude.some((pattern) => pattern.test(url));
    } else {
      return true;
    }
  }

  get origin() {
    return this.#origin;
  }

  // Returns the canonical Page instance for a URL, creating it on first use.
  getPage(url: URL) {
    let page = this.#pages.get(url.toString());
    if (!page) {
      page = new Page(this, url);
      this.#pages.set(url.toString(), page);
    }
    return page;
  }

  async crawlPages(): Promise<Page[]> {
    return await crawlPages(this.#origin);
  }
}

// A Page lazily fetches, sanitizes, and converts one URL; results are memoized.
class Page {
  root: Root;
  url: URL;
  isPage = true;

  #html: string | null = null;
  #content: string | null = null;
  #links: Page[] | null = null;

  _id?: number;

  constructor(root: Root, url: URL) {
    this.root = root;
    this.url = url;
  }

  async getHTML() {
    if (this.#html !== null || !this.isPage)
      return this.#html;
    const content = await fetchHtml(this.url);
    if (!content) {
      // Not an HTML page (or the fetch failed); remember that and stop retrying.
      this.isPage = false;
      return (this.#html = null);
    } else {
      const sanitized = await sanitizeHtml(content);
      return (this.#html = sanitized);
    }
  }

  async getLinks() {
    if (this.#links !== null || !this.isPage)
      return this.#links;
    const html = await this.getHTML();
    if (!html) return (this.#links = []);
    const doc = parseBody(this.url, html);
    const urls = extractLinks(doc);
    // Memoize the resolved in-scope links so later passes reuse them.
    return (this.#links = urls
      .filter((url) => this.root.isURLAllowed(url))
      .map((url) => this.root.getPage(url)));
  }

  async getContent(): Promise<string | null> {
    if (this.#content !== null || !this.isPage)
      return this.#content;
    const html = await this.getHTML();
    if (!html) return (this.#content = null);
    const markdown = await extractContentToMarkdown(this.url, html);
    if (!markdown) return (this.#content = null);
    const rewritten = await rewriteMarkdown(this.url, markdown);
    return (this.#content = await transferTitle(markdown, rewritten));
  }
}

// Depth-first crawl from `page`, collecting every reachable URL that turns out
// to be an actual HTML page.
async function crawlPages(page: Page, visited = new Set<Page>([page]), depth = 1): Promise<Page[]> {
  const links = await page.getLinks();
  if (links) {
    log(`crawling (${links.length} links, depth = ${depth})`, page.url.pathname);
    for (const link of links) {
      if (visited.has(link)) {
        continue;
      } else {
        // Mark the linked page as visited before recursing into it.
        visited.add(link);
        await crawlPages(link, visited, depth + 1);
      }
    }
  }
  return [...visited].filter((page) => !!page.isPage);
}

export async function crawl(opts: CrawlOptions): Promise<Page[]> {
  const root = new Root(opts);
  const pages = await root.crawlPages();
  if (pages.length <= 1) {
    return pages;
  }
  // Build a weighted digraph over the crawled pages; links that appear earlier
  // on a page get a higher weight.
  const graph = new WeightedDiGraph(pages.length);
  for (let idx = 0; idx < pages.length; idx++) {
    pages[idx]!._id = idx;
  }
  for (let i = 0; i < pages.length; i++) {
    const from = pages[i]!;
    const links = await from.getLinks();
    if (!links) continue;
    for (let j = 0; j < links.length; j++) {
      const to = links[j]!;
      if (to._id != null && to.isPage) {
        const weight = links.length - j;
        const edge = new Edge(i, to._id!, weight);
        graph.addEdge(edge);
      }
    }
  }
  // Keep the origin plus every page that appears as the source of a
  // minimum-spanning-tree edge.
  const kruskal = new KruskalMST(graph);
  const output = new Set<Page>([root.origin]);
  for (const edge of kruskal.mst) {
    const page = pages[edge.from()];
    if (page) output.add(page);
  }
  return [...output];
}
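
// Example usage (a sketch only; the URL and patterns below are hypothetical
// and not part of this repo):
//
//   const pages = await crawl({
//     baseURL: 'https://example.com/docs/',
//     include: ['/docs/*'],
//     exclude: ['/docs/changelog*'],
//   });
//   for (const page of pages) {
//     const markdown = await page.getContent();
//     if (markdown) console.log(page.url.pathname, markdown.length);
//   }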