🪻 distributed transcription service thistle.dunkirk.sh

feat: use ai to break up paragraphs and rephrase stuff

dunkirk.sh ed81b190 2c25d3a2

verified
-1
src/lib/transcription.ts
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
-import { parseVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
+53 -52
src/lib/vtt-cleaner.test.ts
···
import { test, expect } from "bun:test";
-import { cleanVTT } from "./vtt-cleaner";
-import { readFileSync } from "fs";
-import { join } from "path";
+import { cleanVTT, parseVTT } from "./vtt-cleaner";
const sampleVTT = `WEBVTT
00:00:00.000 --> 00:00:03.480
<|startoftranscript|> [SIDE CONVERSATION]<|endoftext|>
-00:00:00.000 --> 00:00:00.000
+00:00:03.480 --> 00:00:05.000
<|startoftranscript|> Yes?
-00:00:00.000 --> 00:00:00.000
+00:00:05.000 --> 00:00:08.000
So with this course packet, what quiz is and exams, and if I can study through here, what you talk about?
-00:00:00.000 --> 00:00:00.000
+00:00:08.000 --> 00:00:10.000
And I give you a good review every time.
-00:00:00.000 --> 00:00:00.000
+00:00:10.000 --> 00:00:12.000
Yeah, so I'd be good to just study that and then we can do it.`;
-test("cleanVTT removes tags and cleans text", async () => {
+test("parseVTT extracts segments correctly", () => {
+const segments = parseVTT(sampleVTT);
+
+expect(segments.length).toBeGreaterThan(0);
+expect(segments[0]?.timestamp).toContain("-->");
+expect(segments[0]?.text).toBeDefined();
+expect(segments[0]?.start).toBeGreaterThanOrEqual(0);
+expect(segments[0]?.end).toBeGreaterThanOrEqual(0);
+});
+
+test("parseVTT handles empty VTT", () => {
+const emptyVTT = "WEBVTT\n\n";
+const segments = parseVTT(emptyVTT);
+
+expect(segments.length).toBe(0);
+});
+
+test("cleanVTT preserves VTT format when AI key not available", async () => {
+// Save original env var
+const originalKey = process.env.LLM_API_KEY;
+
+// Remove key to test fallback
+delete process.env.LLM_API_KEY;
+
const result = await cleanVTT("test-vtt", sampleVTT);
expect(result).toContain("WEBVTT");
-expect(result).not.toContain("[SIDE CONVERSATION]");
-expect(result).not.toContain("<|startoftranscript|>");
-expect(result).not.toContain("<|endoftext|>");
expect(result).toContain("-->");
-
-console.log("Cleaned VTT preview:", result.substring(0, 200));
-}, 30000);
+
+// Restore original key
+if (originalKey) {
+process.env.LLM_API_KEY = originalKey;
+}
+});
test("cleanVTT preserves empty VTT", async () => {
const emptyVTT = "WEBVTT\n\n";
···
expect(result).toBe(emptyVTT);
});
-test("cleanVTT detects multiple paragraphs", async () => {
-const multiParaVTT = `WEBVTT
-
-Paragraph 1-1
-00:00:00.000 --> 00:00:00.000
-Again, thank you for the privilege to not only study here, but also to teach here. Jesus,
-
-Paragraph 1-2
-00:00:00.000 --> 00:00:00.000
-thank you. All`;
-
-const result = await cleanVTT("test-multi-para", multiParaVTT);
-
-expect(result).toContain("Paragraph 1-1");
-expect(result).toContain("Paragraph 2-1");
-// Should have at least two paragraphs
-const paraMatches = result.match(/Paragraph \d+-\d+/g);
-expect(paraMatches?.length).toBeGreaterThan(1);
-}, 30000);
-
-test("cleanVTT with real transcription data", async () => {
-const originalApiKey = process.env.OPENROUTER_API_KEY;
-// Temporarily unset to force fallback
-delete process.env.OPENROUTER_API_KEY;
-
-try {
-const vttPath = join(__dirname, "../../transcripts/d69d8076-598a-4fe5-8100-fe3eff47fcd6.vtt");
-const realVTT = readFileSync(vttPath, "utf-8");
+// Integration test - only runs if API key is available
+test("cleanVTT uses AI when available", async () => {
+if (!process.env.LLM_API_KEY) {
+console.log("Skipping AI test - no LLM_API_KEY set");
+return;
+}
-const result = await cleanVTT("real-test", realVTT);
+const result = await cleanVTT("test-ai", sampleVTT);
-expect(result).toContain("WEBVTT");
-// Check that it has multiple paragraph numbers
-const paraMatches = result.match(/Paragraph (\d+)-\d+/g);
-const uniqueParas = new Set(paraMatches?.map(m => m.match(/Paragraph (\d+)/)?.[1]));
-expect(uniqueParas.size).toBeGreaterThan(1);
-console.log("Paragraphs found:", uniqueParas.size);
-} finally {
-process.env.OPENROUTER_API_KEY = originalApiKey;
-}
+expect(result).toContain("WEBVTT");
+expect(result).toContain("-->");
+
+// AI should clean up tags
+expect(result).not.toContain("<|startoftranscript|>");
+expect(result).not.toContain("[SIDE CONVERSATION]");
+
+// Should have paragraph formatting
+expect(result).toContain("Paragraph");
+
+console.log("AI-cleaned VTT preview:", result.substring(0, 300));
}, 30000);
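
Note: the assertions above imply the segment shape that parseVTT returns, whose implementation is collapsed below. A minimal sketch of that shape, assuming it matches the VTTSegment interface touched in src/lib/vtt-cleaner.ts (the field comments are inferred from the tests, not part of the commit):

// Inferred contract: one segment per cue; "WEBVTT\n\n" yields an empty array.
interface VTTSegment {
  index?: number | string; // cue identifier, e.g. "Paragraph 1-2" after cleaning
  timestamp: string;       // e.g. "00:00:03.480 --> 00:00:05.000"
  text: string;            // cue text
  start?: number;          // non-negative cue start
  end?: number;            // non-negative cue end
}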
+97 -105
src/lib/vtt-cleaner.ts
···
-// Parse and clean VTT files
-
-import type { ParagraphBoundary } from "./transcript-cleaner";
+// Parse and clean VTT files using AI
interface VTTSegment {
-index?: number;
+index?: number | string;
timestamp: string;
text: string;
start?: number;
···
}
/**
-* Clean VTT text segments by removing tags and fixing grammar.
-* Additionally, merge cleaned segments into paragraph cues while preserving
-* stable paragraph IDs (derived from first segment start time).
+* Clean VTT text using AI to create paragraph-separated VTT file.
+* Uses OpenRouter API to intelligently group segments into paragraphs
+* while preserving timing information.
*/
export async function cleanVTT(
transcriptionId: string,
···
`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
);
-// Combine all text for cleaning and paragraphing
-const allText = segments.map((s) => s.text).join(" ");
-
-// Attempt LLM-driven cleaning and paragraphing in one request, fallback to deterministic rules
-let paragraphBoundaries: ParagraphBoundary[] = [];
+const apiKey = process.env.LLM_API_KEY;
+const apiBaseUrl = process.env.LLM_API_BASE_URL;
+const model = process.env.LLM_MODEL;
+
+if (!apiKey || !apiBaseUrl || !model) {
+console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
+return vttContent;
+}
try {
-const { cleanAndGetParagraphBoundaries } = await import(
-"./transcript-cleaner"
-);
-const result = await cleanAndGetParagraphBoundaries({
-transcriptId: transcriptionId,
-rawTranscript: allText,
-segments: segments.map((s) => ({
-index: s.index,
-start: s.start,
-end: s.end,
-text: s.text,
-})),
-maxWordsMove: 0,
-});
+// Build the input for the AI
+const inputSegments = segments.map((seg, idx) => ({
+index: idx,
+timestamp: seg.timestamp,
+text: seg.text,
+}));
+
+const prompt = `Can you turn this into a paragraph separated vtt file?
+
+Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
+
+Paragraph 1-1
+00:00:00.000 --> 00:00:05.559
+Today in chapel we are talking about the fact that we believe in having gospel
+
+Paragraph 1-2
+00:00:05.559 --> 00:00:08.639
+conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's
+
+Paragraph 1-3
+00:00:08.639 --> 00:00:11.960
+gonna be a little more conversational than normal.
-if (result?.paragraphs) {
-paragraphBoundaries = result.paragraphs;
-}
-} catch (e) {
-console.warn(
-"[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
-e,
-);
-}
+Paragraph 2-1
+00:00:11.960 --> 00:00:15.000
+Now let's talk about something different.
-if (paragraphBoundaries.length === 0) {
-// No paragraphs detected, treat as one big paragraph
-paragraphBoundaries = [
-{
-startSegmentIndex: 0,
-endSegmentIndex: segments.length - 1,
-text: allText,
-},
-];
-}
+I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
-// Get the full cleaned transcript from paragraphs
-const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
+Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
-// Split cleaned text back into segments proportionally (word-based)
-const words = cleanedTranscript.split(/\s+/).filter(Boolean);
-const originalWords = allText.split(/\s+/).filter(Boolean);
-const ratio = words.length / Math.max(1, originalWords.length);
+Input segments:
+${JSON.stringify(inputSegments, null, 2)}
-let wordIndex = 0;
-const cleanedSegments: VTTSegment[] = [];
+Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
-for (const segment of segments) {
-const originalWordCount = Math.max(
-1,
-segment.text.split(/\s+/).filter(Boolean).length,
+const response = await fetch(
+`${apiBaseUrl}/chat/completions`,
+{
+method: "POST",
+headers: {
+"Content-Type": "application/json",
+"Authorization": `Bearer ${apiKey}`,
+"HTTP-Referer": "https://thistle.app",
+"X-Title": "Thistle Transcription",
+},
+body: JSON.stringify({
+model,
+messages: [
+{ role: "user", content: prompt },
+],
+temperature: 0.3,
+max_tokens: 16384,
+}),
+},
);
-const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
-const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
-wordIndex += newWordCount;
+
+if (!response.ok) {
+const errorText = await response.text();
+console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
+console.warn("[VTTCleaner] Falling back to uncleaned VTT");
+return vttContent;
+}
-cleanedSegments.push({
-index: segment.index,
-timestamp: segment.timestamp,
-text: segmentWords.join(" "),
-start: segment.start,
-end: segment.end,
-});
-}
+const result = await response.json();
+const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
-// If any remaining words, append to last segment
-if (wordIndex < words.length && cleanedSegments.length > 0) {
-const rest = words.slice(wordIndex).join(" ");
-const lastIdx = cleanedSegments.length - 1;
-const lastSeg = cleanedSegments[lastIdx];
-if (lastSeg) {
-lastSeg.text += (lastSeg.text ? " " : "") + rest;
+if (!cleanedVTT) {
+console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
+return vttContent;
}
-}
-// Assign paragraph-based IDs to segments
-for (let i = 0; i < cleanedSegments.length; i++) {
-const seg = cleanedSegments[i];
-if (!seg) continue;
+// Extract VTT content if the model wrapped it in markdown
+let finalVTT = cleanedVTT;
+if (cleanedVTT.includes("```")) {
+const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
+if (vttMatch?.[1]) {
+finalVTT = vttMatch[1].trim();
+}
+}
-// Find which paragraph this segment belongs to
-let paraIndex = 0;
-let segmentInPara = 1;
-for (let p = 0; p < paragraphBoundaries.length; p++) {
-const para = paragraphBoundaries[p];
-if (i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
-paraIndex = p + 1;
-segmentInPara = i - para.startSegmentIndex + 1;
-break;
+// Ensure it starts with WEBVTT
+if (!finalVTT.startsWith("WEBVTT")) {
+const webvttIndex = finalVTT.indexOf("WEBVTT");
+if (webvttIndex !== -1) {
+finalVTT = finalVTT.substring(webvttIndex);
+} else {
+finalVTT = `WEBVTT\n\n${finalVTT}`;
}
}
-// Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
-seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
-}
+console.log(
+`[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`,
+);
-// Build output VTT with cleaned segment cues having paragraph-based IDs
-let output = "WEBVTT\n\n";
-for (const seg of cleanedSegments) {
-if (!seg || !seg.timestamp || !seg.text) continue;
-output += `${seg.index}\n`;
-output += `${seg.timestamp}\n`;
-output += `${seg.text}\n\n`;
+return finalVTT;
+} catch (err) {
+console.error("[VTTCleaner] Exception:", err);
+console.warn("[VTTCleaner] Falling back to uncleaned VTT");
+return vttContent;
}
-
-console.log(
-`[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,
-);
-
-return output;
}
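
For context, the rewritten cleanVTT is driven entirely by environment configuration and falls back to returning the input VTT unchanged on any failure. A minimal caller sketch, assuming an OpenAI-compatible endpoint such as OpenRouter and a hypothetical model id; none of the concrete values below are part of the commit:

// Hypothetical wiring; the base URL and model id are placeholders.
import { cleanVTT } from "./src/lib/vtt-cleaner";

process.env.LLM_API_BASE_URL ??= "https://openrouter.ai/api/v1";
process.env.LLM_MODEL ??= "example/model-id"; // placeholder
// LLM_API_KEY must be supplied by the deployment; if any of the three
// variables is missing, cleanVTT logs a warning and returns the raw VTT.

const rawVTT = `WEBVTT

00:00:00.000 --> 00:00:03.480
<|startoftranscript|> [SIDE CONVERSATION]<|endoftext|>`;

// Returns AI-cleaned, paragraph-cued VTT, or the raw VTT on the fallback path.
const cleaned = await cleanVTT("example-id", rawVTT);
console.log(cleaned.startsWith("WEBVTT")); // true on either path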