src/lib/vtt-cleaner.ts at v0.1.0 · dunkirk.sh/thistle

dunkirk.sh / thistle
🪻 distributed transcription service thistle.dunkirk.sh
thistle / src / lib / vtt-cleaner.ts
at v0.1.0 14 kB view raw
  1// Parse and clean VTT files using AI
  2
  3interface VTTSegment {
  4	index?: number | string;
  5	timestamp: string;
  6	text: string;
  7	start?: number;
  8	end?: number;
  9}
 10
 11/**
 12 * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds
 13 */
 14function parseTimestampToSeconds(ts?: string): number {
 15	if (!ts) return 0;
 16	// ts expected like "00:00:09.039"
 17	const parts = ts.split(":").map((p) => p.trim());
 18	const hh = parts[0] ?? "0";
 19	const mm = parts[1] ?? "0";
 20	const ss = parts[2] ?? "0";
 21	if (parts.length === 3) {
 22		const seconds =
 23			parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
 24		return seconds;
 25	} else if (parts.length === 2) {
 26		return parseInt(mm, 10) * 60 + parseFloat(ss);
 27	}
 28	return 0;
 29}
 30
 31/**
 32 * Parse VTT content into segments, populating start/end in seconds
 33 */
 34export function parseVTT(vttContent: string): VTTSegment[] {
 35	const lines = vttContent.split("\n");
 36	const segments: VTTSegment[] = [];
 37	let currentSegment: Partial<VTTSegment> = {};
 38
 39	for (let i = 0; i < lines.length; i++) {
 40		const line = lines[i]?.trim();
 41
 42		if (!line) {
 43			if (currentSegment.timestamp && currentSegment.text) {
 44				// parse start/end
 45				const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
 46					currentSegment.timestamp || "",
 47				);
 48				if (match) {
 49					currentSegment.start = parseTimestampToSeconds(match[1]);
 50					currentSegment.end = parseTimestampToSeconds(match[2]);
 51				}
 52				segments.push(currentSegment as VTTSegment);
 53				currentSegment = {};
 54			}
 55			continue;
 56		}
 57
 58		if (line === "WEBVTT") {
 59			continue;
 60		}
 61
 62		// Check if it's a cue id (before timestamp)
 63		if (!currentSegment.timestamp && line && !line.includes("-->")) {
 64			currentSegment.index = line;
 65			continue;
 66		}
 67
 68		// Check if it's a timestamp line
 69		if (line.includes("-->")) {
 70			currentSegment.timestamp = line;
 71			// Next line(s) will be text
 72			const textLines: string[] = [];
 73			i++;
 74			while (
 75				i < lines.length &&
 76				lines[i]?.trim() &&
 77				!lines[i]?.includes("-->")
 78			) {
 79				textLines.push(lines[i] || "");
 80				i++;
 81			}
 82			currentSegment.text = textLines.join("\n").trim();
 83			i--; // Back up one since the loop will increment
 84		} else if (/^\d+$/.test(line)) {
 85			// It's an index number
 86			currentSegment.index = Number.parseInt(line, 10);
 87		}
 88	}
 89
 90	// Add last segment if exists
 91	if (currentSegment.timestamp && currentSegment.text) {
 92		const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
 93			currentSegment.timestamp || "",
 94		);
 95		if (match?.[1] && match[2]) {
 96			currentSegment.start = parseTimestampToSeconds(match[1]);
 97			currentSegment.end = parseTimestampToSeconds(match[2]);
 98		}
 99		segments.push(currentSegment as VTTSegment);
100	}
101
102	return segments;
103}
104
/**
 * Number of VTT segments grouped into one processing chunk
 */
const CHUNK_SIZE = 40;
109
/**
 * Find paragraph boundaries in processed VTT content.
 *
 * Segments are the blocks separated by one or more blank lines; a segment
 * belongs to a paragraph when its first line contains "Paragraph X-Y".
 * Line endings are normalised to "\n" first, so CRLF content and runs of
 * several blank lines are handled the same way as plain "\n\n" separators.
 *
 * @param vttContent - Processed VTT text (without the "WEBVTT" header).
 * @returns
 *   - `segments`: every segment of the last paragraph, joined with "\n\n"
 *     (segments without a header that sit inside that paragraph are kept);
 *     empty when no paragraph header exists.
 *   - `paragraphNumber`: the number of that last paragraph as written in the
 *     content, or null when none was found.
 *   - `highestParagraphNumber`: the largest paragraph number found in any
 *     header, or 0 when none was found.
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
	segments: string;
	paragraphNumber: string | null;
	highestParagraphNumber: number;
} {
	if (!vttContent)
		return { segments: "", paragraphNumber: null, highestParagraphNumber: 0 };

	const paragraphHeader = /Paragraph (\d+)-\d+/;

	// Normalise line endings, then split on any run of blank lines so that a
	// segment never starts with a stray newline (which would hide its header).
	const parsed = vttContent
		.replace(/\r\n?/g, "\n")
		.split(/\n(?:[ \t]*\n)+/)
		.map((segment) => segment.trim())
		.filter(Boolean)
		.map((segment) => {
			const firstLine = segment.split("\n", 1)[0] ?? "";
			return {
				segment,
				number: paragraphHeader.exec(firstLine)?.[1] ?? null,
			};
		});

	if (parsed.length === 0)
		return { segments: "", paragraphNumber: null, highestParagraphNumber: 0 };

	// Highest paragraph number across the whole content
	let highestParagraphNumber = 0;
	for (const { number } of parsed) {
		if (number !== null) {
			highestParagraphNumber = Math.max(
				highestParagraphNumber,
				parseInt(number, 10),
			);
		}
	}

	// Walk backwards collecting the segments of the final paragraph
	const lastSegments: string[] = [];
	let currentParagraphNumber: string | null = null;

	for (let i = parsed.length - 1; i >= 0; i--) {
		const entry = parsed[i];
		if (!entry) continue;

		if (entry.number === null) {
			// No header: keep it only if we are already inside the last paragraph
			if (currentParagraphNumber !== null) {
				lastSegments.unshift(entry.segment);
			}
			continue;
		}

		if (currentParagraphNumber === null) {
			// First header seen from the end defines the last paragraph
			currentParagraphNumber = entry.number;
		} else if (entry.number !== currentParagraphNumber) {
			// Reached the paragraph before it
			break;
		}
		lastSegments.unshift(entry.segment);
	}

	return {
		segments: lastSegments.join("\n\n"),
		paragraphNumber: currentParagraphNumber,
		highestParagraphNumber,
	};
}
191
/**
 * The part of a chat-completions response body that this module reads.
 */
interface ChatCompletionResponse {
	choices?: Array<{ message?: { content?: unknown } }>;
}

/**
 * Pull the VTT payload out of a raw model reply.
 *
 * Unwraps the first markdown code fence (with any language tag, LF or CRLF,
 * closed or left open at the end of the reply) and drops a leading "WEBVTT"
 * header together with the blank lines that follow it.
 */
function extractChunkVTT(content: string): string {
	let vtt = content;

	if (vtt.includes("```")) {
		const fenced = /```[\w-]*[ \t]*\r?\n([\s\S]*?)(?:```|$)/.exec(vtt);
		if (fenced?.[1]) {
			vtt = fenced[1].trim();
		}
	}

	// The header is added once when the chunks are combined
	if (vtt.startsWith("WEBVTT")) {
		const lines = vtt.split("\n");
		let i = 1;
		while (i < lines.length && !lines[i]?.trim()) {
			i++;
		}
		vtt = lines.slice(i).join("\n");
	}

	return vtt;
}

/**
 * Process a chunk of VTT segments using AI.
 *
 * Builds a prompt from the segments (plus, when available, the last
 * paragraph of the previous chunk as read-only context), POSTs it to
 * `${apiBaseUrl}/chat/completions` and returns the VTT text extracted from
 * the first choice of the response.
 *
 * @param transcriptionId - Used to label log output for this chunk.
 * @param inputSegments - Segments to process; serialised as JSON into the prompt.
 * @param chunkIndex - Position of this chunk; used in logs and the X-Title header.
 * @param previousParagraphNumber - Highest paragraph number produced so far, or
 *   null. Only mentioned in the prompt when `previousParagraphText` is present
 *   and the value is numeric.
 * @param apiKey - Bearer token for the API.
 * @param apiBaseUrl - Base URL of the API; trailing slashes are ignored.
 * @param model - Model identifier sent in the request body.
 * @param previousParagraphText - Last paragraph of the previous chunk, given to
 *   the model as context only.
 * @returns The chunk's VTT text without a "WEBVTT" header or markdown fence.
 * @throws Error when the response status is not OK, when the response has no
 *   text content, or when the request itself fails (the error is logged and
 *   rethrown).
 */
async function processVTTChunk(
	transcriptionId: string,
	inputSegments: Array<{ index: number; timestamp: string; text: string }>,
	chunkIndex: number,
	previousParagraphNumber: string | null,
	apiKey: string,
	apiBaseUrl: string,
	model: string,
	previousParagraphText?: string,
): Promise<string> {
	const chunkId = `${transcriptionId}-chunk${chunkIndex}`;

	const hasTextContext = !!previousParagraphText;

	console.log(
		`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? " and previous paragraph text context" : ""}`,
	);

	// Only tell the model where to resume numbering when we actually have a
	// usable number; a non-numeric value must not end up as "NaN" in the prompt.
	const previousNumber = previousParagraphNumber
		? parseInt(previousParagraphNumber, 10)
		: Number.NaN;
	const numberingHint = Number.isNaN(previousNumber)
		? ""
		: `Start your paragraphs with number ${previousNumber + 1} (unless you're continuing the previous paragraph).`;

	const contextInstructions = hasTextContext
		? `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:

${previousParagraphText}

Now process the following new segments, continuing from the previous paragraph. ${numberingHint}`
		: "Process the following segments:";

	const prompt = `Can you turn this into a paragraph separated vtt file?

Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

Paragraph 1-1
00:00:00.000 --> 00:00:05.559
Today in chapel we are talking about the fact that we believe in having gospel

Paragraph 1-2
00:00:05.559 --> 00:00:08.639
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's

Paragraph 1-3
00:00:08.639 --> 00:00:11.960
gonna be a little more conversational than normal.

Paragraph 2-1
00:00:11.960 --> 00:00:15.000
Now let's talk about something different.

I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

Here are important guidelines for forming paragraphs:
1. Create a new paragraph when there's a change in topic or speaker.
2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
3. Group related thoughts together in the same paragraph.
4. Start a new paragraph when a sentence introduces a completely new idea.
5. Focus on the number of sentences, not segments, when creating paragraphs.
6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.

Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

${contextInstructions}

${JSON.stringify(inputSegments, null, 2)}

Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;

	try {
		// Tolerate a base URL configured with a trailing slash
		const endpoint = `${apiBaseUrl.replace(/\/+$/, "")}/chat/completions`;

		const response = await fetch(endpoint, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
				"HTTP-Referer": "https://thistle.app",
				"X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
			},
			body: JSON.stringify({
				model,
				messages: [{ role: "user", content: prompt }],
				temperature: 0.3,
				max_tokens: 8192, // Reduced for chunks
			}),
		});

		if (!response.ok) {
			const errorText = await response.text();
			console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
			throw new Error(`API error: ${response.status}`);
		}

		const result = (await response.json()) as ChatCompletionResponse;
		const content = result.choices?.[0]?.message?.content;
		const cleanedVTT = typeof content === "string" ? content.trim() : "";

		if (!cleanedVTT) {
			throw new Error("Empty response from AI");
		}

		const chunkVTT = extractChunkVTT(cleanedVTT);

		console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
		return chunkVTT;
	} catch (error) {
		console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
		throw error;
	}
}
320
/**
 * Clean VTT text using AI to create a paragraph-separated VTT file.
 *
 * The content is parsed into segments, split into chunks of CHUNK_SIZE
 * segments and processed one chunk at a time. After each chunk the last
 * paragraph of its output is extracted and handed to the next chunk as
 * context so paragraphs can continue across chunk boundaries.
 *
 * Configuration is read from the LLM_API_KEY, LLM_API_BASE_URL and LLM_MODEL
 * environment variables.
 *
 * @param transcriptionId - Identifier used in log output.
 * @param vttContent - Raw VTT content to clean.
 * @returns The cleaned VTT (with a single "WEBVTT" header). The original
 *   `vttContent` is returned unchanged when it contains no segments, when the
 *   configuration is incomplete, or when an unexpected error occurs. A chunk
 *   whose processing fails is emitted in its original form instead.
 */
export async function cleanVTT(
	transcriptionId: string,
	vttContent: string,
): Promise<string> {
	const segments = parseVTT(vttContent);

	if (segments.length === 0) {
		return vttContent;
	}

	console.log(
		`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
	);

	const apiKey = process.env.LLM_API_KEY;
	const apiBaseUrl = process.env.LLM_API_BASE_URL;
	const model = process.env.LLM_MODEL;

	if (!apiKey || !apiBaseUrl || !model) {
		console.warn(
			"[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT",
		);
		return vttContent;
	}

	try {
		// Build the input segments; `index` is the position in the whole file
		const inputSegments = segments.map((seg, idx) => ({
			index: idx,
			timestamp: seg.timestamp,
			text: seg.text,
		}));

		// Prepare chunks for sequential processing (slice clamps at the end)
		const chunks: Array<typeof inputSegments> = [];
		for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
			chunks.push(inputSegments.slice(i, i + CHUNK_SIZE));
		}

		console.log(
			`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`,
		);

		// Process chunks sequentially with context from previous chunk
		const processedChunks: string[] = [];
		let previousParagraphText: string | undefined;
		let previousParagraphNumber: string | null = null;

		for (let i = 0; i < chunks.length; i++) {
			const chunk = chunks[i];
			if (!chunk || chunk.length === 0) continue;

			try {
				const processedChunk = await processVTTChunk(
					transcriptionId,
					chunk,
					i,
					previousParagraphNumber,
					apiKey,
					apiBaseUrl,
					model,
					previousParagraphText,
				);
				processedChunks.push(processedChunk);
				console.log(
					`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? " (with context)" : ""}`,
				);

				// Extract context for the next chunk
				if (i < chunks.length - 1) {
					const {
						segments: lastParagraphText,
						paragraphNumber,
						highestParagraphNumber,
					} = extractLastParagraphAndHighestNumber(processedChunk);

					if (lastParagraphText) {
						console.log(
							`[VTTCleaner] Using paragraph ${paragraphNumber || "unknown"} as context for next chunk (highest paragraph: ${highestParagraphNumber})`,
						);
						previousParagraphText = lastParagraphText;
						previousParagraphNumber = highestParagraphNumber.toString();
					} else {
						// Nothing usable: the next chunk is processed without context
						previousParagraphText = undefined;
						previousParagraphNumber = null;
					}
				}
			} catch (error) {
				console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
				// Return the original segments for this chunk if processing fails.
				// The index is always written, including 0 for the first segment.
				const fallbackChunk = chunk
					.map((seg) => `${seg.index}\n${seg.timestamp}\n${seg.text}`)
					.join("\n\n");
				processedChunks.push(fallbackChunk);
				// The failed chunk produced no paragraphs to continue from
				previousParagraphText = undefined;
				previousParagraphNumber = null;
			}
		}

		// Combine all processed chunks
		const finalVTT = `WEBVTT\n\n${processedChunks.join("\n\n")}`;

		console.log(
			`[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
		);

		return finalVTT;
	} catch (error) {
		console.error("[VTTCleaner] Exception:", error);
		console.warn("[VTTCleaner] Falling back to uncleaned VTT");
		return vttContent;
	}
}