import { debug } from 'debug';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { URLPattern } from 'urlpattern-polyfill/urlpattern';
import { WeightedDiGraph, KruskalMST, Edge } from 'js-graph-algorithms';
import { extractContent, extractLinks, parseBody } from "./dom";
import { fetchHtml } from "./fetch";
import { htmlToMarkdown, sanitizeHtml, transferTitle } from "./unified";
import { rewriteMarkdown } from './rewrite';
import { makeCacheFileHelper } from './path';

const log = debug('llms-txt-gen:graph');

export interface CrawlOptions {
  baseURL: URL | string;
  include?: string[];
  exclude?: string[];
}

// Extracted markdown is cached on disk so repeated runs skip re-extraction.
const cacheDir = path.join(process.cwd(), '.cache/page');
await fs.mkdir(cacheDir, { recursive: true });
const getCacheFile = makeCacheFileHelper(cacheDir, '.md');

async function extractContentToMarkdown(url: URL, html: string): Promise<string | null> {
  const cacheFile = await getCacheFile(url);
  try {
    const content = await fs.readFile(cacheFile, 'utf-8');
    if (content) {
      log('extracted output from cache', url.pathname);
      return content;
    }
  } catch {
    // Cache miss: fall through and extract from the HTML.
  }
  log('extracting content', url.pathname);
  const doc = parseBody(url, html);
  const output = extractContent(doc);
  if (output) {
    const markdown = await htmlToMarkdown(output);
    await fs.writeFile(cacheFile, markdown, 'utf-8');
    return markdown;
  } else {
    return null;
  }
}

// Root owns the page map and the include/exclude URL patterns for one crawl.
class Root {
  baseURL: URL;
  #include: URLPattern[];
  #exclude: URLPattern[];
  #pages: Map<string, Page> = new Map();
  #origin: Page;

  constructor(opts: CrawlOptions) {
    const baseURL = new URL(opts.baseURL);
    const baseURLStr = baseURL.toString();
    this.baseURL = baseURL;
    this.#origin = new Page(this, baseURL);
    // Key by the normalized URL string so getPage() finds the origin again.
    this.#pages.set(baseURLStr, this.#origin);
    const toPattern = (pattern: string) => new URLPattern(pattern, baseURLStr);
    this.#include = opts.include?.map(toPattern) ?? [];
    this.#exclude = opts.exclude?.map(toPattern) ?? [];
  }

  // A URL is crawlable if it shares the base origin, matches an include
  // pattern (when any are given), and matches no exclude pattern.
  isURLAllowed(url: URL): boolean {
    if (url.origin !== this.baseURL.origin)
      return false;
    if (this.#include.length) {
      if (this.#include.some((pattern) => pattern.test(url))) {
        return !this.#exclude.some((pattern) => pattern.test(url));
      } else {
        return false;
      }
    } else if (this.#exclude.length) {
      return !this.#exclude.some((pattern) => pattern.test(url));
    } else {
      return true;
    }
  }

  get origin() {
    return this.#origin;
  }

  // Returns the canonical Page instance for a URL, creating it on first use.
  getPage(url: URL) {
    let page = this.#pages.get(url.toString());
    if (!page) {
      page = new Page(this, url);
      this.#pages.set(url.toString(), page);
    }
    return page;
  }

  async crawlPages(): Promise<Page[]> {
    return await crawlPages(this.#origin);
  }
}

// A Page lazily fetches, sanitizes, and converts one URL; results are memoized.
class Page {
  root: Root;
  url: URL;
  isPage = true;

  #html: string | null = null;
  #content: string | null = null;
  #links: Page[] | null = null;

  _id?: number;

  constructor(root: Root, url: URL) {
    this.root = root;
    this.url = url;
  }

  async getHTML() {
    if (this.#html !== null || !this.isPage)
      return this.#html;
    const content = await fetchHtml(this.url);
    if (!content) {
      // Not an HTML page (or the fetch failed); remember that and stop retrying.
      this.isPage = false;
      return (this.#html = null);
    } else {
      const sanitized = await sanitizeHtml(content);
      return (this.#html = sanitized);
    }
  }

  async getLinks() {
    if (this.#links !== null || !this.isPage)
      return this.#links;
    const html = await this.getHTML();
    if (!html) return (this.#links = []);
    const doc = parseBody(this.url, html);
    const urls = extractLinks(doc);
    // Memoize the resolved in-scope links so later passes reuse them.
    return (this.#links = urls
      .filter((url) => this.root.isURLAllowed(url))
      .map((url) => this.root.getPage(url)));
  }

  async getContent(): Promise<string | null> {
    if (this.#content !== null || !this.isPage)
      return this.#content;
    const html = await this.getHTML();
    if (!html) return (this.#content = null);
    const markdown = await extractContentToMarkdown(this.url, html);
    if (!markdown) return (this.#content = null);
    const rewritten = await rewriteMarkdown(this.url, markdown);
    return (this.#content = await transferTitle(markdown, rewritten));
  }
}

// Depth-first crawl from `page`, collecting every reachable URL that turns out
// to be an actual HTML page.
async function crawlPages(page: Page, visited = new Set<Page>([page]), depth = 1): Promise<Page[]> {
  const links = await page.getLinks();
  if (links) {
    log(`crawling (${links.length} links, depth = ${depth})`, page.url.pathname);
    for (const link of links) {
      if (visited.has(link)) {
        continue;
      } else {
        // Mark the linked page as visited before recursing into it.
        visited.add(link);
        await crawlPages(link, visited, depth + 1);
      }
    }
  }
  return [...visited].filter((page) => !!page.isPage);
}

export async function crawl(opts: CrawlOptions): Promise<Page[]> {
  const root = new Root(opts);
  const pages = await root.crawlPages();
  if (pages.length <= 1) {
    return pages;
  }
  // Build a weighted digraph over the crawled pages; links that appear earlier
  // on a page get a higher weight.
  const graph = new WeightedDiGraph(pages.length);
  for (let idx = 0; idx < pages.length; idx++) {
    pages[idx]!._id = idx;
  }
  for (let i = 0; i < pages.length; i++) {
    const from = pages[i]!;
    const links = await from.getLinks();
    if (!links) continue;
    for (let j = 0; j < links.length; j++) {
      const to = links[j]!;
      if (to._id != null && to.isPage) {
        const weight = links.length - j;
        const edge = new Edge(i, to._id!, weight);
        graph.addEdge(edge);
      }
    }
  }
  // Keep the origin plus every page that appears as the source of a
  // minimum-spanning-tree edge.
  const kruskal = new KruskalMST(graph);
  const output = new Set<Page>([root.origin]);
  for (const edge of kruskal.mst) {
    const page = pages[edge.from()];
    if (page) output.add(page);
  }
  return [...output];
}
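
// Example usage (a sketch only; the URL and patterns below are hypothetical
// and not part of this repo):
//
//   const pages = await crawl({
//     baseURL: 'https://example.com/docs/',
//     include: ['/docs/*'],
//     exclude: ['/docs/changelog*'],
//   });
//   for (const page of pages) {
//     const markdown = await page.getContent();
//     if (markdown) console.log(page.url.pathname, markdown.length);
//   }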