this repo has no description

Add prompt rewrite for markdown sections

+2
.env.example
···
+
OPENAI_API_KEY=
+
OPENAI_API_URL=https://api.cloudflare.com/client/v4/accounts/ID/ai/v1
+50
bun.lock
···
"": {
"name": "llms-txt-gen",
"dependencies": {
+
"@ai-sdk/openai": "^1.3.22",
"@mozilla/readability": "^0.6.0",
"@tsconfig/bun": "^1.0.8",
+
"ai": "^4.3.16",
+
"ai-fallback": "^0.1.5",
"debug": "^4.4.1",
"happy-dom": "^18.0.1",
"hast-util-sanitize": "^5.0.2",
"js-graph-algorithms": "^1.0.18",
"mdast": "^3.0.0",
+
"ollama-ai-provider": "^1.2.0",
"prettier": "^3.5.3",
"rehype-parse": "^9.0.1",
"rehype-remark": "^10.0.1",
···
},
},
"packages": {
+
"@ai-sdk/openai": ["@ai-sdk/openai@1.3.22", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, "peerDependencies": { "zod": "^3.0.0" } }, "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw=="],
+
+
"@ai-sdk/provider": ["@ai-sdk/provider@1.1.3", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg=="],
+
+
"@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@2.2.8", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "nanoid": "^3.3.8", "secure-json-parse": "^2.7.0" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA=="],
+
+
"@ai-sdk/react": ["@ai-sdk/react@1.2.12", "", { "dependencies": { "@ai-sdk/provider-utils": "2.2.8", "@ai-sdk/ui-utils": "1.2.11", "swr": "^2.2.5", "throttleit": "2.1.0" }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", "zod": "^3.23.8" }, "optionalPeers": ["zod"] }, "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g=="],
+
+
"@ai-sdk/ui-utils": ["@ai-sdk/ui-utils@1.2.11", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8", "zod-to-json-schema": "^3.24.1" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w=="],
+
"@mozilla/readability": ["@mozilla/readability@0.6.0", "", {}, "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ=="],
+
+
"@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="],
"@tsconfig/bun": ["@tsconfig/bun@1.0.8", "", {}, "sha512-JlJaRaS4hBTypxtFe8WhnwV8blf0R+3yehLk8XuyxUYNx6VXsKCjACSCvOYEFUiqlhlBWxtYCn/zRlOb8BzBQg=="],
"@types/debug": ["@types/debug@4.1.12", "", { "dependencies": { "@types/ms": "*" } }, "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ=="],
+
+
"@types/diff-match-patch": ["@types/diff-match-patch@1.0.36", "", {}, "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg=="],
"@types/hast": ["@types/hast@3.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ=="],
···
"@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="],
+
"ai": ["ai@4.3.16", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8", "@ai-sdk/react": "1.2.12", "@ai-sdk/ui-utils": "1.2.11", "@opentelemetry/api": "1.9.0", "jsondiffpatch": "0.6.0" }, "peerDependencies": { "react": "^18 || ^19 || ^19.0.0-rc", "zod": "^3.23.8" }, "optionalPeers": ["react"] }, "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g=="],
+
+
"ai-fallback": ["ai-fallback@0.1.5", "", { "dependencies": { "@ai-sdk/provider": "^1", "@ai-sdk/provider-utils": "^2" } }, "sha512-/FhTd9SGMEUDYBKbO3ZyfS0CBGglJByMbMRQOGjjDYlxZinFZtn99w1SPh4NZYJWIP5jjoewytfZjp+30QPT1A=="],
+
"bail": ["bail@2.0.2", "", {}, "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw=="],
"ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="],
+
+
"chalk": ["chalk@5.4.1", "", {}, "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w=="],
"character-entities": ["character-entities@2.0.2", "", {}, "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ=="],
···
"devlop": ["devlop@1.1.0", "", { "dependencies": { "dequal": "^2.0.0" } }, "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA=="],
+
"diff-match-patch": ["diff-match-patch@1.0.5", "", {}, "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw=="],
+
"entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],
"escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="],
···
"is-plain-obj": ["is-plain-obj@4.1.0", "", {}, "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg=="],
"js-graph-algorithms": ["js-graph-algorithms@1.0.18", "", { "bin": { "js-graphs": "./src/jsgraphs.js" } }, "sha512-Gu1wtWzXBzGeye/j9BuyplGHscwqKRZodp/0M1vyBc19RJpblSwKGu099KwwaTx9cRIV+Qupk8xUMfEiGfFqSA=="],
+
+
"json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="],
+
+
"jsondiffpatch": ["jsondiffpatch@0.6.0", "", { "dependencies": { "@types/diff-match-patch": "^1.0.36", "chalk": "^5.3.0", "diff-match-patch": "^1.0.5" }, "bin": { "jsondiffpatch": "bin/jsondiffpatch.js" } }, "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ=="],
"longest-streak": ["longest-streak@3.1.0", "", {}, "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g=="],
···
"ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
+
"nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="],
+
+
"ollama-ai-provider": ["ollama-ai-provider@1.2.0", "", { "dependencies": { "@ai-sdk/provider": "^1.0.0", "@ai-sdk/provider-utils": "^2.0.0", "partial-json": "0.1.7" }, "peerDependencies": { "zod": "^3.0.0" }, "optionalPeers": ["zod"] }, "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww=="],
+
"parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="],
+
"partial-json": ["partial-json@0.1.7", "", {}, "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA=="],
+
"prettier": ["prettier@3.5.3", "", { "bin": { "prettier": "bin/prettier.cjs" } }, "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw=="],
"property-information": ["property-information@7.1.0", "", {}, "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ=="],
+
+
"react": ["react@19.1.0", "", {}, "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg=="],
"rehype-minify-whitespace": ["rehype-minify-whitespace@6.0.2", "", { "dependencies": { "@types/hast": "^3.0.0", "hast-util-minify-whitespace": "^1.0.0" } }, "sha512-Zk0pyQ06A3Lyxhe9vGtOtzz3Z0+qZ5+7icZ/PL/2x1SHPbKao5oB/g/rlc6BCTajqBb33JcOe71Ye1oFsuYbnw=="],
···
"remark-unlink": ["remark-unlink@5.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-squeeze-paragraphs": "^6.0.0", "unist-util-visit": "^5.0.0" } }, "sha512-8NFrI3SecxhOLb734tKaxcU//lNDABabz1I26MGjdlpkUg1I+Fr7lyqL9ckxaCB4kErXD10mScPD7yhCXX4Pfw=="],
+
"secure-json-parse": ["secure-json-parse@2.7.0", "", {}, "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="],
+
"space-separated-tokens": ["space-separated-tokens@2.0.2", "", {}, "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q=="],
"stringify-entities": ["stringify-entities@4.0.4", "", { "dependencies": { "character-entities-html4": "^2.0.0", "character-entities-legacy": "^3.0.0" } }, "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg=="],
+
+
"swr": ["swr@2.3.3", "", { "dependencies": { "dequal": "^2.0.3", "use-sync-external-store": "^1.4.0" }, "peerDependencies": { "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A=="],
+
+
"throttleit": ["throttleit@2.1.0", "", {}, "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw=="],
"trim-lines": ["trim-lines@3.0.1", "", {}, "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg=="],
···
"urlpattern-polyfill": ["urlpattern-polyfill@10.1.0", "", {}, "sha512-IGjKp/o0NL3Bso1PymYURCJxMPNAf/ILOpendP9f5B6e1rTJgdgiOvgfoT8VxCAdY+Wisb9uhGaJJf3yZ2V9nw=="],
+
"use-sync-external-store": ["use-sync-external-store@1.5.0", "", { "peerDependencies": { "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A=="],
+
"vfile": ["vfile@6.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile-message": "^4.0.0" } }, "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q=="],
"vfile-location": ["vfile-location@5.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile": "^6.0.0" } }, "sha512-5yXvWDEgqeiYiBe1lbxYF7UMAIm/IcopxMHrMQDq3nvKcjPKIhZklUKL+AE7J7uApI4kwe2snsK+eI6UTj9EHg=="],
···
"web-namespaces": ["web-namespaces@2.0.1", "", {}, "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ=="],
"whatwg-mimetype": ["whatwg-mimetype@3.0.0", "", {}, "sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q=="],
+
+
"zod": ["zod@3.25.61", "", {}, "sha512-fzfJgUw78LTNnHujj9re1Ov/JJQkRZZGDMcYqSx7Hp4rPOkKywaFHq0S6GoHeXs0wGNE/sIOutkXgnwzrVOGCQ=="],
+
+
"zod-to-json-schema": ["zod-to-json-schema@3.24.5", "", { "peerDependencies": { "zod": "^3.24.1" } }, "sha512-/AuWwMP+YqiPbsJx5D6TfgRTc4kTLjsh5SOcd4bLsfUg2RcEXrFMJl1DGgdHy2aCfsIA/cr/1JM0xcB2GZji8g=="],
"zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="],
}
+4
package.json
···
"start": "DEBUG=* bun src/index.ts"
},
"dependencies": {
+
"@ai-sdk/openai": "^1.3.22",
"@mozilla/readability": "^0.6.0",
"@tsconfig/bun": "^1.0.8",
+
"ai": "^4.3.16",
+
"ai-fallback": "^0.1.5",
"debug": "^4.4.1",
"happy-dom": "^18.0.1",
"hast-util-sanitize": "^5.0.2",
"js-graph-algorithms": "^1.0.18",
"mdast": "^3.0.0",
+
"ollama-ai-provider": "^1.2.0",
"prettier": "^3.5.3",
"rehype-parse": "^9.0.1",
"rehype-remark": "^10.0.1",
+3 -10
src/fetch.ts
···
import { debug } from 'debug';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
+
import { makeCacheFileHelper } from './path';
const log = debug('llms-txt-gen.fetch');
-
const cacheDir = path.join(process.cwd(), '.cache');
+
const cacheDir = path.join(process.cwd(), '.cache/fetch');
await fs.mkdir(cacheDir, { recursive: true });
-
-
const getCacheFile = async (url: URL) => {
-
const { hostname, pathname } = url;
-
const name = pathname.split('/').filter(Boolean).join('_');
-
const targetDir = path.join(cacheDir, hostname);
-
const basename = path.basename(name, path.extname(name));
-
await fs.mkdir(targetDir, { recursive: true });
-
return path.join(targetDir, `${basename}.html`);
-
};
+
const getCacheFile = makeCacheFileHelper(cacheDir);
export async function fetchHtml(url: URL): Promise<string | null> {
const cacheFile = await getCacheFile(url);
+7 -4
src/index.ts
···
/**
 * Crawls a site, collects the markdown of every page that produced content,
 * and writes the combined, formatted result to `llms-full-<site.name>.txt`.
 *
 * @param site - The site to crawl; `site.name` is used in log output and in
 *   the generated file name.
 */
async function generate(site: Site) {
  log('crawl', site.name);
  const pages = await crawl(site);

  // Pages are resolved one after another, and pages without content
  // (falsy `getContent()` result) are left out of the final document.
  const contents: string[] = [];
  for (const page of pages) {
    const content = await page.getContent();
    if (content) contents.push(content);
  }

  // Deliberately NOT named `output`: a local with that name shadows the
  // `output` binding declared outside this function, which `path.join` below
  // relies on, and would make the file path start with the markdown text.
  const combined = await concatMarkdown(contents);
  const formatted = await formatMarkdown(combined);

  const file = path.join(output, `llms-full-${site.name}.txt`);
  await fs.writeFile(file, formatted, 'utf-8');
}
+34 -8
src/page.ts
···
import { debug } from 'debug';
+
import * as fs from 'node:fs/promises';
+
import * as path from 'node:path';
import { URLPattern } from 'urlpattern-polyfill/urlpattern';
import { WeightedDiGraph, KruskalMST, Edge } from 'js-graph-algorithms';
import { extractContent, extractLinks, parseBody } from "./dom";
import { fetchHtml } from "./fetch";
import { htmlToMarkdown, sanitizeHtml } from "./unified";
+
import { rewriteMarkdown } from './rewrite';
+
import { makeCacheFileHelper } from './path';
const log = debug('llms-txt-gen.graph');
···
baseURL: URL | string;
include?: string[];
exclude?: string[];
+
}
+
+
// Extracted markdown is cached under `.cache/page` in the current working directory.
const cacheDir = path.join(process.cwd(), '.cache/page');
+
// The cache directory is created while the module loads (top-level await).
await fs.mkdir(cacheDir, { recursive: true });
+
// Maps a page URL to its `.md` cache file inside `cacheDir`.
const getCacheFile = makeCacheFileHelper(cacheDir, '.md');
+
+
/**
 * Turns a page's HTML into markdown, using a per-URL file cache.
 *
 * A non-empty cache file for `url` is returned as-is. Otherwise the HTML is
 * parsed, its content extracted and converted to markdown, and the result is
 * written to the cache before being returned.
 *
 * @param url - Page URL; selects the cache file and is passed to `parseBody`.
 * @param html - Raw HTML of the page.
 * @returns The markdown, or `null` when no content could be extracted.
 */
async function extractContentToMarkdown(url: URL, html: string): Promise<string | null> {
  const cachePath = await getCacheFile(url);

  try {
    const cached = await fs.readFile(cachePath, 'utf-8');
    if (cached) {
      log('extracted output from cache', url.pathname);
      return cached;
    }
  } catch {
    // Any read failure is treated as a cache miss.
  }

  log('extracting content', url.pathname);
  const extracted = extractContent(parseBody(url, html));
  if (!extracted) return null;

  const markdown = await htmlToMarkdown(extracted);
  await fs.writeFile(cachePath, markdown, 'utf-8');
  return markdown;
}
class Root {
···
if (this.#content !== null || !this.isPage)
return this.#content;
const html = await this.getHTML();
-
if (!html) return null;
-
const doc = parseBody(this.url, html);
-
const content = extractContent(doc);
-
if (content) {
-
return (this.#content = await htmlToMarkdown(content));
-
} else {
-
return (this.#content = null);
-
}
+
if (!html) return (this.#content = null);
+
const markdown = await extractContentToMarkdown(this.url, html);
+
if (!markdown) return (this.#content = null);
+
const rewritten = await rewriteMarkdown(this.url, markdown);
+
return (this.#content = rewritten);
}
}
+11
src/path.ts
···
+
import * as fs from 'node:fs/promises';
+
import * as path from 'node:path';
+
+
/**
 * Creates a helper that maps a URL to a cache file path under `baseDir`.
 *
 * The returned function places files in `<baseDir>/<hostname>/` (creating the
 * directory if needed) and names them after the URL path: non-empty path
 * segments are joined with `_`, and `ext` is appended.
 *
 * Only the final path segment has its own extension removed (for example
 * `page.html` becomes `page`). Taking the extension of the whole joined name
 * would cut the name at the last dot anywhere in the path, so that
 * `/api/v1.2/foo` and `/api/v1.2/bar` would both collapse to `api_v1` and
 * share one cache entry.
 *
 * @param baseDir - Root directory of this cache.
 * @param ext - Extension (including the dot) for cache files. Defaults to `.html`.
 * @returns An async function resolving a `URL` to its cache file path.
 */
export const makeCacheFileHelper = (baseDir: string, ext = '.html') => async (url: URL) => {
  const { hostname, pathname } = url;
  const segments = pathname.split('/').filter(Boolean);

  // Strip the extension from the last segment only; dots in earlier segments
  // are part of the page's identity and must be kept.
  const last = segments.pop();
  if (last !== undefined) {
    segments.push(path.basename(last, path.extname(last)));
  }
  const basename = segments.join('_');

  const targetDir = path.join(baseDir, hostname);
  await fs.mkdir(targetDir, { recursive: true });
  return path.join(targetDir, `${basename}${ext}`);
};
+66
src/rewrite.ts
···
+
import { debug } from 'debug';
+
import { createFallback } from 'ai-fallback';
+
import { generateText } from 'ai';
+
import { createOpenAI } from '@ai-sdk/openai';
+
import { createOllama } from 'ollama-ai-provider';
+
import * as fs from 'node:fs/promises';
+
import * as path from 'node:path';
+
+
import { makeCacheFileHelper } from './path';
+
+
// Debug logger for this module.
const log = debug('llms-txt-gen.rewrite');
+
+
// Rewritten output is cached under `.cache/rewrite` in the current working directory.
const cacheDir = path.join(process.cwd(), '.cache/rewrite');
+
// The cache directory is created while the module loads (top-level await).
await fs.mkdir(cacheDir, { recursive: true });
+
// Maps a page URL to its `.txt` cache file inside `cacheDir`.
const getCacheFile = makeCacheFileHelper(cacheDir, '.txt');
+
+
// Both variables are checked at import time; loading this module throws if either is missing.
if (!process.env.OPENAI_API_KEY) throw new Error('Missing OPENAI_API_KEY env var');
+
if (!process.env.OPENAI_API_URL) throw new Error('Missing OPENAI_API_URL env var');
+
+
// System prompt sent with every rewrite request; it is trimmed where it is used.
const SYSTEM_PROMPT = `
+
Reformat markdown content you're given into an llms-full.txt file, also in markdown format
+
- Where the format isn't easily understandable by AI, reformat it faithfully to make it processable
+
- Reformat for an AI and paraphrase where necessary, but don't add interpretations
+
- Preserve code snippets and keep them in TypeScript or TypeScript typings format
+
- Avoid using emphasis or excessive markdown syntax, but keep code snippets where they are
+
- Don't mention other content, pages, or external content (Remove sentences such as "Refer to", "Read more")
+
- When encountering a markdown table, ensure that you don't output a separate legend, and keep all relevant information in the table
+
- Don't use any knowledge you may have on the subject. Only output what you're given.
+
`;
+
+
// Provider created with `createOpenAI`, configured from OPENAI_API_KEY and OPENAI_API_URL.
const ai = createOpenAI({
+
apiKey: process.env.OPENAI_API_KEY,
+
baseURL: process.env.OPENAI_API_URL,
+
});
+
+
// Provider created with `createOllama`, pointed at an Ollama API on localhost port 11434.
const ollama = createOllama({
+
baseURL: 'http://localhost:11434/api',
+
});
+
+
/**
 * Rewrites a page's markdown with a language model, using a per-URL file cache.
 *
 * A non-empty cache file for `url` is returned without prompting. Otherwise
 * the markdown is sent with the module's system prompt to a fallback chain of
 * models (the Ollama model is listed first, then the OpenAI-provider model),
 * and the generated text is written to the cache before being returned.
 *
 * @param url - Page URL; selects the cache file and appears in log output.
 * @param input - Markdown to rewrite, sent as the user prompt.
 * @returns The rewritten text.
 */
export async function rewriteMarkdown(url: URL, input: string) {
  const cachePath = await getCacheFile(url);

  try {
    const cached = await fs.readFile(cachePath, 'utf-8');
    if (cached) {
      log('prompt output from cache', url.pathname);
      return cached;
    }
  } catch {
    // Any read failure is treated as a cache miss.
  }

  log('prompting to rewrite', url.pathname);

  const model = createFallback({
    models: [ollama('gemma:7b'), ai('@hf/google/gemma-7b-it')],
    onError(error, modelId) {
      log(`error using model ${modelId}`, error);
    },
  });

  const result = await generateText({
    model,
    system: SYSTEM_PROMPT.trim(),
    prompt: input,
  });

  await fs.writeFile(cachePath, result.text, 'utf-8');
  return result.text;
}
+11 -2
src/unified.ts
···
parent.children.splice(index, 1);
if (node.children.length > 1 || !child || child.type !== 'text')
return;
-
switch (child.value.trim()) {
+
const value = child.value.trim();
+
switch (value) {
case 'Example':
case 'Remarks':
case 'Note':
···
} else if (node.type === 'text') {
if (!parent || parent.type !== 'paragraph' || parent.children.length > 1)
return;
-
switch (node.value.trim()) {
+
const value = node.value.trim();
+
if (
+
value.startsWith('Last updated on ') ||
+
value.startsWith('Copyright ')
+
) {
+
parent.children.splice(index, 1);
+
return;
+
}
+
switch (value) {
case 'Loading...':
case 'Caution':
case 'tsx':