// src/lib/vtt-cleaner.ts at e11f0959e241b3bfda787bbb6238544b771255cd · dunkirk.sh/thistle
// thistle: 🪻 distributed transcription service (thistle.dunkirk.sh)
//
// Parse and clean VTT files using AI
  2
  3interface VTTSegment {
  4	index?: number | string;
  5	timestamp: string;
  6	text: string;
  7	start?: number;
  8	end?: number;
  9}
 10
 11/**
 12 * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds
 13 */
 14function parseTimestampToSeconds(ts?: string): number {
 15	if (!ts) return 0;
 16	// ts expected like "00:00:09.039"
 17	const parts = ts.split(":").map((p) => p.trim());
 18	const hh = parts[0] ?? "0";
 19	const mm = parts[1] ?? "0";
 20	const ss = parts[2] ?? "0";
 21	if (parts.length === 3) {
 22		const seconds =
 23			parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
 24		return seconds;
 25	} else if (parts.length === 2) {
 26		return parseInt(mm, 10) * 60 + parseFloat(ss);
 27	}
 28	return 0;
 29}
 30
 31/**
 32 * Parse VTT content into segments, populating start/end in seconds
 33 */
 34export function parseVTT(vttContent: string): VTTSegment[] {
 35	const lines = vttContent.split("\n");
 36	const segments: VTTSegment[] = [];
 37	let currentSegment: Partial<VTTSegment> = {};
 38
 39	for (let i = 0; i < lines.length; i++) {
 40		const line = lines[i]?.trim();
 41
 42		if (!line) {
 43			if (currentSegment.timestamp && currentSegment.text) {
 44				// parse start/end
 45				const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
 46					currentSegment.timestamp || "",
 47				);
 48				if (match) {
 49					currentSegment.start = parseTimestampToSeconds(match[1]);
 50					currentSegment.end = parseTimestampToSeconds(match[2]);
 51				}
 52				segments.push(currentSegment as VTTSegment);
 53				currentSegment = {};
 54			}
 55			continue;
 56		}
 57
 58		if (line === "WEBVTT") {
 59			continue;
 60		}
 61
 62		// Check if it's a cue id (before timestamp)
 63		if (!currentSegment.timestamp && line && !line.includes("-->")) {
 64			currentSegment.index = line;
 65			continue;
 66		}
 67
 68		// Check if it's a timestamp line
 69		if (line.includes("-->")) {
 70			currentSegment.timestamp = line;
 71			// Next line(s) will be text
 72			const textLines: string[] = [];
 73			i++;
 74			while (
 75				i < lines.length &&
 76				lines[i]?.trim() &&
 77				!lines[i]?.includes("-->")
 78			) {
 79				textLines.push(lines[i] || "");
 80				i++;
 81			}
 82			currentSegment.text = textLines.join("\n").trim();
 83			i--; // Back up one since the loop will increment
 84		} else if (/^\d+$/.test(line)) {
 85			// It's an index number
 86			currentSegment.index = Number.parseInt(line, 10);
 87		}
 88	}
 89
 90	// Add last segment if exists
 91	if (currentSegment.timestamp && currentSegment.text) {
 92		const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
 93			currentSegment.timestamp || "",
 94		);
 95		if (match?.[1] && match[2]) {
 96			currentSegment.start = parseTimestampToSeconds(match[1]);
 97			currentSegment.end = parseTimestampToSeconds(match[2]);
 98		}
 99		segments.push(currentSegment as VTTSegment);
100	}
101
102	return segments;
103}
104
/**
 * Upper bound on the number of parsed segments placed in one processing chunk.
 */
const CHUNK_SIZE = 40;
109
/**
 * Find paragraph boundaries in processed VTT content.
 *
 * The content is split into blank-line separated blocks. A block is labelled
 * when its first line contains "Paragraph X-Y"; X is its paragraph number.
 *
 * The "last paragraph" is found by walking backwards from the end: unlabelled
 * blocks after the final labelled block are ignored, then every block is
 * collected until a block with a different paragraph number is met (unlabelled
 * blocks in between are kept).
 *
 * Block separators may be CRLF, contain spaces/tabs, or span several blank
 * lines; model output is not guaranteed to use exactly one "\n\n".
 *
 * @param vttContent - Processed VTT text (without the "WEBVTT" header).
 * @returns `segments`: the last paragraph's blocks joined with "\n\n" ("" if no
 *   labelled block exists); `paragraphNumber`: that paragraph's number as
 *   written, or null; `highestParagraphNumber`: the largest paragraph number
 *   seen anywhere, or 0.
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
	segments: string;
	paragraphNumber: string | null;
	highestParagraphNumber: number;
} {
	const nothingFound = {
		segments: "",
		paragraphNumber: null,
		highestParagraphNumber: 0,
	};
	if (!vttContent) return nothingFound;

	const blocks = vttContent
		.replace(/\r\n?/g, "\n")
		.split(/\n(?:[ \t]*\n)+/)
		.map((block) => block.trim())
		.filter((block) => block.length > 0);
	if (blocks.length === 0) return nothingFound;

	// Paragraph number of each block (as written), or null when unlabelled.
	const labelPattern = /Paragraph (\d+)-\d+/;
	const labels = blocks.map(
		(block) => labelPattern.exec(block.split("\n", 1)[0] ?? "")?.[1] ?? null,
	);

	let highestParagraphNumber = 0;
	let lastLabelled = -1;
	labels.forEach((label, position) => {
		if (label === null) return;
		lastLabelled = position;
		const value = Number.parseInt(label, 10);
		if (value > highestParagraphNumber) highestParagraphNumber = value;
	});

	if (lastLabelled === -1) return nothingFound;

	const paragraphNumber = labels[lastLabelled] ?? null;

	// Extend backwards over same-numbered and unlabelled blocks.
	let first = lastLabelled;
	while (first > 0) {
		const previous = labels[first - 1] ?? null;
		if (previous !== null && previous !== paragraphNumber) break;
		first--;
	}

	return {
		segments: blocks.slice(first, lastLabelled + 1).join("\n\n"),
		paragraphNumber,
		highestParagraphNumber,
	};
}
191
/**
 * Reduce a raw model reply to bare VTT cue text.
 *
 * Unwraps the first Markdown code fence (whatever its language tag) and drops
 * a leading "WEBVTT" header line together with the blank lines after it.
 */
function unwrapModelVTT(reply: string): string {
	let body = reply.replace(/\r\n?/g, "\n").trim();

	// Models label fences inconsistently ("vtt", "webvtt", "text", nothing).
	const fenced = /```[^\n]*\n([\s\S]*?)```/.exec(body);
	if (fenced?.[1]) {
		body = fenced[1].trim();
	}

	// The header is added once by the caller when the chunks are combined.
	if (body.startsWith("WEBVTT")) {
		const lines = body.split("\n");
		let firstContentLine = 1;
		while (
			firstContentLine < lines.length &&
			!lines[firstContentLine]?.trim()
		) {
			firstContentLine++;
		}
		body = lines.slice(firstContentLine).join("\n");
	}

	return body;
}

/**
 * Process a chunk of VTT segments using AI.
 *
 * Sends one chat-completion request (POST `${apiBaseUrl}/chat/completions`)
 * whose prompt asks the model to regroup the segments into "Paragraph X-Y"
 * blocks, and returns the reply as bare VTT text (no Markdown fence, no
 * "WEBVTT" header).
 *
 * @param transcriptionId - Used to build the chunk id shown in error logs.
 * @param inputSegments - Segments to send; serialised as JSON into the prompt.
 * @param chunkIndex - Position of this chunk; used in logs and the X-Title header.
 * @param previousParagraphNumber - Paragraph number to continue from. The
 *   prompt asks the model to start at this number plus one, but only when
 *   `previousParagraphText` is also supplied.
 * @param apiKey - Bearer token for the API.
 * @param apiBaseUrl - Base URL of the chat-completions API.
 * @param model - Model identifier passed through to the API.
 * @param previousParagraphText - Last paragraph of the previous chunk, given
 *   to the model as read-only context.
 * @returns The cleaned VTT text for this chunk.
 * @throws Error "API error: <status>" on a non-OK response, "Empty response
 *   from AI" when the reply carries no text; network and JSON errors are
 *   logged and rethrown unchanged.
 */
async function processVTTChunk(
	transcriptionId: string,
	inputSegments: Array<{ index: number; timestamp: string; text: string }>,
	chunkIndex: number,
	previousParagraphNumber: string | null,
	apiKey: string,
	apiBaseUrl: string,
	model: string,
	previousParagraphText?: string,
): Promise<string> {
	const chunkId = `${transcriptionId}-chunk${chunkIndex}`;

	const hasTextContext = !!previousParagraphText;

	console.log(
		`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? " and previous paragraph text context" : ""}`,
	);

	// A missing or non-numeric previous number means numbering starts at 1.
	const previousNumber = previousParagraphNumber
		? Number.parseInt(previousParagraphNumber, 10)
		: Number.NaN;
	const nextParagraphNumber = Number.isNaN(previousNumber)
		? "1"
		: String(previousNumber + 1);

	const prompt = `Can you turn this into a paragraph separated vtt file?

Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

Paragraph 1-1
00:00:00.000 --> 00:00:05.559
Today in chapel we are talking about the fact that we believe in having gospel

Paragraph 1-2
00:00:05.559 --> 00:00:08.639
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's

Paragraph 1-3
00:00:08.639 --> 00:00:11.960
gonna be a little more conversational than normal.

Paragraph 2-1
00:00:11.960 --> 00:00:15.000
Now let's talk about something different.

I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

Here are important guidelines for forming paragraphs:
1. Create a new paragraph when there's a change in topic or speaker.
2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
3. Group related thoughts together in the same paragraph.
4. Start a new paragraph when a sentence introduces a completely new idea.
5. Focus on the number of sentences, not segments, when creating paragraphs.
6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.

Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

${
	hasTextContext
		? `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:

${previousParagraphText}

Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ""}`
		: "Process the following segments:"
}

${JSON.stringify(inputSegments, null, 2)}

Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;

	try {
		const response = await fetch(`${apiBaseUrl}/chat/completions`, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
				"HTTP-Referer": process.env.ORIGIN || "http://localhost:3000",
				"X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
			},
			body: JSON.stringify({
				model,
				messages: [{ role: "user", content: prompt }],
				temperature: 0.3,
				max_tokens: 8192, // Reduced for chunks
			}),
		});

		if (!response.ok) {
			const errorText = await response.text();
			console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
			throw new Error(`API error: ${response.status}`);
		}

		// The reply is external data: only the one field that is read is
		// described, and it is checked to be text before use.
		const result = (await response.json()) as {
			choices?: Array<{ message?: { content?: unknown } }>;
		} | null;
		const content = result?.choices?.[0]?.message?.content;
		const cleanedVTT = typeof content === "string" ? content.trim() : "";

		if (!cleanedVTT) {
			throw new Error("Empty response from AI");
		}

		const chunkVTT = unwrapModelVTT(cleanedVTT);

		console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
		return chunkVTT;
	} catch (error) {
		console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
		throw error;
	}
}
320
/**
 * Clean VTT text using AI to create a paragraph-separated VTT file.
 *
 * The input is parsed into segments and split into chunks of CHUNK_SIZE. The
 * chunks are processed one after another so that each request can carry the
 * last paragraph of the previous chunk's output as context. When a chunk
 * fails, its original segments are emitted instead and the context is reset.
 *
 * Reads LLM_API_KEY, LLM_API_BASE_URL and LLM_MODEL from the environment.
 *
 * @param transcriptionId - Identifier used in log messages and chunk ids.
 * @param vttContent - Raw WebVTT document.
 * @returns The processed chunks joined under a single "WEBVTT" header. The
 *   input is returned unchanged when no segments can be parsed from it or an
 *   unexpected error occurs outside per-chunk processing.
 */
export async function cleanVTT(
	transcriptionId: string,
	vttContent: string,
): Promise<string> {
	const segments = parseVTT(vttContent);

	if (segments.length === 0) {
		return vttContent;
	}

	console.log(
		`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
	);

	// Validated at startup (per the original note); not re-checked here.
	const apiKey = process.env.LLM_API_KEY as string;
	const apiBaseUrl = process.env.LLM_API_BASE_URL as string;
	const model = process.env.LLM_MODEL as string;

	try {
		// Segments are renumbered from 0 in file order for the model.
		const inputSegments = segments.map((seg, idx) => ({
			index: idx,
			timestamp: seg.timestamp,
			text: seg.text,
		}));

		// slice() clamps at the end of the array, so the last chunk may be short.
		const chunks: Array<typeof inputSegments> = [];
		for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
			chunks.push(inputSegments.slice(i, i + CHUNK_SIZE));
		}

		console.log(
			`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`,
		);

		// Process chunks sequentially with context from previous chunk
		const processedChunks: string[] = [];
		let previousParagraphText: string | undefined;
		let previousParagraphNumber: string | null = null;

		for (const [i, chunk] of chunks.entries()) {
			try {
				const processedChunk = await processVTTChunk(
					transcriptionId,
					chunk,
					i,
					previousParagraphNumber,
					apiKey,
					apiBaseUrl,
					model,
					previousParagraphText,
				);
				processedChunks.push(processedChunk);
				console.log(
					`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? " (with context)" : ""}`,
				);

				// Extract context for the next chunk
				if (i < chunks.length - 1) {
					const {
						segments: lastParagraphText,
						paragraphNumber,
						highestParagraphNumber,
					} = extractLastParagraphAndHighestNumber(processedChunk);

					if (lastParagraphText) {
						console.log(
							`[VTTCleaner] Using paragraph ${paragraphNumber || "unknown"} as context for next chunk (highest paragraph: ${highestParagraphNumber})`,
						);
						previousParagraphText = lastParagraphText;
						previousParagraphNumber = highestParagraphNumber.toString();
					} else {
						previousParagraphText = undefined;
						previousParagraphNumber = null;
					}
				}
			} catch (error) {
				console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
				// Emit the original segments for this chunk. The index is written
				// as-is: segment 0 is a valid id and must not be dropped.
				const fallbackChunk = chunk
					.map((seg) => `${seg.index}\n${seg.timestamp}\n${seg.text}`)
					.join("\n\n");
				processedChunks.push(fallbackChunk);
				previousParagraphText = undefined;
				previousParagraphNumber = null;
			}
		}

		// Combine all processed chunks
		const finalVTT = `WEBVTT\n\n${processedChunks.join("\n\n")}`;

		console.log(
			`[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
		);

		return finalVTT;
	} catch (error) {
		console.error("[VTTCleaner] Exception:", error);
		console.warn("[VTTCleaner] Falling back to uncleaned VTT");
		return vttContent;
	}
}