distributed transcription service
thistle.dunkirk.sh
1// Parse and clean VTT files using AI
2
/**
 * One cue extracted from a VTT document.
 */
interface VTTSegment {
  /** The raw timing line of the cue, i.e. the line containing `-->`. */
  timestamp: string;
  /** Cue text; multiple text lines are joined with `\n`. */
  text: string;
  /** Cue identifier, when one was present (kept as text or parsed as a number). */
  index?: string | number;
  /** Cue start in seconds, derived from `timestamp` when it could be parsed. */
  start?: number;
  /** Cue end in seconds, derived from `timestamp` when it could be parsed. */
  end?: number;
}
10
/**
 * Convert a VTT timestamp (`hh:mm:ss.mmm` or `mm:ss.mmm`) into seconds.
 *
 * @param ts - Timestamp text; whitespace around each colon-separated field is ignored.
 * @returns The total number of seconds, or 0 when `ts` is missing, does not
 *   have two or three colon-separated fields, or contains a non-numeric field.
 */
function parseTimestampToSeconds(ts?: string): number {
  if (!ts) return 0;

  // ts expected like "00:00:09.039" (or "00:09.039" without the hour field)
  const parts = ts.split(":").map((p) => p.trim());
  if (parts.length !== 2 && parts.length !== 3) return 0;

  // Read the fields from the right so that the short form maps
  // minutes/seconds correctly: the last field is always seconds.
  const ss = parts[parts.length - 1] ?? "0";
  const mm = parts[parts.length - 2] ?? "0";
  const hh = parts.length === 3 ? (parts[0] ?? "0") : "0";

  const seconds =
    parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);

  // Keep the "0 on unparseable input" contract instead of leaking NaN.
  return Number.isFinite(seconds) ? seconds : 0;
}
30
/**
 * Parse VTT content into segments, populating start/end in seconds.
 *
 * Lines are processed top to bottom:
 * - a `WEBVTT` header line (optionally followed by header text) is skipped;
 * - a line seen before a cue's timing line is stored verbatim as the cue
 *   identifier (`index`, as a string);
 * - a line containing `-->` is the timing line; the non-empty lines that
 *   follow it (up to a blank line or another timing line) form the cue text;
 * - a blank line completes the pending cue.
 *
 * Cues with empty text are not emitted. Both `\n` and `\r\n` line endings are
 * accepted.
 *
 * @param vttContent - Raw VTT document text.
 * @returns The parsed cues in document order; `start`/`end` are set when the
 *   timing line matches `<time> --> <time>`.
 */
export function parseVTT(vttContent: string): VTTSegment[] {
  // Split on CRLF as well so "\r" does not end up inside multi-line cue text.
  const lines = vttContent.split(/\r?\n/);
  const segments: VTTSegment[] = [];
  let currentSegment: Partial<VTTSegment> = {};

  // Emit the pending cue if it has both a timing line and text, then reset.
  // An incomplete cue is left untouched so a later line can still complete it.
  const flushCurrentSegment = (): void => {
    const { timestamp, text } = currentSegment;
    if (!timestamp || !text) return;

    const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(timestamp);
    if (match) {
      currentSegment.start = parseTimestampToSeconds(match[1]);
      currentSegment.end = parseTimestampToSeconds(match[2]);
    }
    segments.push({ ...currentSegment, timestamp, text });
    currentSegment = {};
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]?.trim();

    if (!line) {
      flushCurrentSegment();
      continue;
    }

    // The header may carry trailing text, e.g. "WEBVTT - Some title".
    if (/^WEBVTT(?:[ \t]|$)/.test(line)) {
      continue;
    }

    if (line.includes("-->")) {
      // Cues that are not separated by a blank line: emit the previous cue
      // before this timing line overwrites it.
      flushCurrentSegment();
      currentSegment.timestamp = line;

      // Consume the cue text lines that follow the timing line.
      const textLines: string[] = [];
      let next = lines[i + 1];
      while (next !== undefined && next.trim() && !next.includes("-->")) {
        textLines.push(next);
        i++;
        next = lines[i + 1];
      }
      currentSegment.text = textLines.join("\n").trim();
      continue;
    }

    if (!currentSegment.timestamp) {
      // Cue identifier appearing before the timing line.
      currentSegment.index = line;
    } else if (/^\d+$/.test(line)) {
      // Numeric identifier seen while an (empty-text) cue is still pending.
      currentSegment.index = Number.parseInt(line, 10);
    }
  }

  // Add last segment if the document did not end with a blank line.
  flushCurrentSegment();

  return segments;
}
104
/**
 * Maximum number of VTT segments grouped into a single chunk.
 */
const CHUNK_SIZE = 40;
109
/**
 * Find paragraph boundaries in processed VTT content.
 *
 * The content is split into blocks separated by one or more blank lines. A
 * block whose first line matches `Paragraph X-Y` belongs to paragraph `X`.
 *
 * @param vttContent - Processed VTT text using `Paragraph X-Y` block headers.
 * @returns
 *   - `segments`: the blocks of the last paragraph (plus any unnumbered blocks
 *     found between them), joined with a blank line; `""` when none is found.
 *   - `paragraphNumber`: the number of that last paragraph, or `null`.
 *   - `highestParagraphNumber`: the largest paragraph number seen in any
 *     block, or 0 when there is none.
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
  segments: string;
  paragraphNumber: string | null;
  highestParagraphNumber: number;
} {
  if (!vttContent)
    return { segments: "", paragraphNumber: null, highestParagraphNumber: 0 };

  // Normalize line endings and split on runs of blank lines. Splitting on an
  // exact "\n\n" would leave a leading newline on a block preceded by extra
  // blank lines (hiding its "Paragraph" header) and would not split CRLF text.
  const segments = vttContent
    .replace(/\r\n?/g, "\n")
    .split(/\n(?:[ \t]*\n)+/)
    .map((segment) => segment.trim())
    .filter(Boolean);
  if (segments.length === 0)
    return { segments: "", paragraphNumber: null, highestParagraphNumber: 0 };

  // Paragraph number (as text) of each block, or null when it has no header.
  const paragraphIds = segments.map((segment) => {
    const firstLine = segment.split("\n", 1)[0] ?? "";
    return /Paragraph (\d+)-\d+/.exec(firstLine)?.[1] ?? null;
  });

  // Highest paragraph number across all blocks.
  let highestParagraphNumber = 0;
  for (const id of paragraphIds) {
    if (id === null) continue;
    const paragraphNum = parseInt(id, 10);
    if (!Number.isNaN(paragraphNum) && paragraphNum > highestParagraphNumber) {
      highestParagraphNumber = paragraphNum;
    }
  }

  // Walk backwards collecting the blocks of the last numbered paragraph.
  const lastSegments: string[] = [];
  let currentParagraphNumber: string | null = null;
  for (let i = segments.length - 1; i >= 0; i--) {
    const segment = segments[i];
    if (!segment) continue;
    const id = paragraphIds[i] ?? null;

    if (id === null) {
      // Unnumbered block: keep it only once collection has started, since it
      // may belong to the paragraph being collected.
      if (currentParagraphNumber) lastSegments.unshift(segment);
      continue;
    }

    if (!currentParagraphNumber) {
      currentParagraphNumber = id;
    } else if (id !== currentParagraphNumber) {
      // Reached the preceding paragraph; the last one is complete.
      break;
    }
    lastSegments.unshift(segment);
  }

  return {
    segments: lastSegments.join("\n\n"),
    paragraphNumber: currentParagraphNumber,
    highestParagraphNumber,
  };
}
191
/**
 * Strip the wrapping a model may put around its VTT answer.
 *
 * If the text contains a Markdown code fence (with or without a language tag),
 * only the fenced content is kept. A leading `WEBVTT` header line, and the
 * blank lines after it, are then removed because the header is added once when
 * the chunks are combined.
 */
function normalizeChunkResponse(rawContent: string): string {
  let chunkVTT = rawContent.trim();

  // Accept any fence info string ("vtt", "webvtt", "text", ...) and CRLF.
  const fenced = /```[\w-]*[ \t]*\r?\n([\s\S]*?)```/.exec(chunkVTT);
  if (fenced?.[1]) {
    chunkVTT = fenced[1].trim();
  }

  if (chunkVTT.startsWith("WEBVTT")) {
    const lines = chunkVTT.split("\n");
    // Skip WEBVTT line and any blank lines that follow
    let firstContentLine = 1;
    while (
      firstContentLine < lines.length &&
      !lines[firstContentLine]?.trim()
    ) {
      firstContentLine++;
    }
    chunkVTT = lines.slice(firstContentLine).join("\n");
  }

  return chunkVTT;
}

/**
 * Process a chunk of VTT segments using AI.
 *
 * Builds a prompt from the segments (and, when given, the last paragraph of
 * the previous chunk as context), POSTs it to `${apiBaseUrl}/chat/completions`
 * and returns the model's answer with any code fence and `WEBVTT` header
 * removed.
 *
 * @param transcriptionId - Identifier used in log messages.
 * @param inputSegments - Segments of this chunk; serialized as JSON into the prompt.
 * @param chunkIndex - Position of the chunk, used in logs and the `X-Title` header.
 * @param previousParagraphNumber - Paragraph number to continue from; the
 *   prompt asks the model to start at this number plus one.
 * @param apiKey - Bearer token sent in the `Authorization` header.
 * @param apiBaseUrl - Base URL of the chat-completions API.
 * @param model - Model name sent in the request body.
 * @param previousParagraphText - Last paragraph of the previous chunk, given
 *   to the model as context only.
 * @returns The cleaned VTT text for this chunk, without a `WEBVTT` header.
 * @throws Error when the API responds with a non-OK status, when the response
 *   has no text content, or when the request itself fails. Errors are logged
 *   and rethrown.
 */
async function processVTTChunk(
  transcriptionId: string,
  inputSegments: Array<{ index: number; timestamp: string; text: string }>,
  chunkIndex: number,
  previousParagraphNumber: string | null,
  apiKey: string,
  apiBaseUrl: string,
  model: string,
  previousParagraphText?: string,
): Promise<string> {
  const chunkId = `${transcriptionId}-chunk${chunkIndex}`;

  const hasTextContext = !!previousParagraphText;

  console.log(
    `[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? " and previous paragraph text context" : ""}`,
  );

  // Fall back to "1" rather than emitting "NaN" for a non-numeric value.
  const previousNumber = previousParagraphNumber
    ? Number.parseInt(previousParagraphNumber, 10)
    : Number.NaN;
  const nextParagraphNumber = Number.isNaN(previousNumber)
    ? "1"
    : String(previousNumber + 1);

  const prompt = `Can you turn this into a paragraph separated vtt file?

Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

Paragraph 1-1
00:00:00.000 --> 00:00:05.559
Today in chapel we are talking about the fact that we believe in having gospel

Paragraph 1-2
00:00:05.559 --> 00:00:08.639
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's

Paragraph 1-3
00:00:08.639 --> 00:00:11.960
gonna be a little more conversational than normal.

Paragraph 2-1
00:00:11.960 --> 00:00:15.000
Now let's talk about something different.

I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

Here are important guidelines for forming paragraphs:
1. Create a new paragraph when there's a change in topic or speaker.
2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
3. Group related thoughts together in the same paragraph.
4. Start a new paragraph when a sentence introduces a completely new idea.
5. Focus on the number of sentences, not segments, when creating paragraphs.
6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.

Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

${
  hasTextContext
    ? `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:

${previousParagraphText}

Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ""}`
    : "Process the following segments:"
}

${JSON.stringify(inputSegments, null, 2)}

Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;

  try {
    // Tolerate a trailing slash on the configured base URL.
    const endpoint = `${apiBaseUrl.replace(/\/+$/, "")}/chat/completions`;

    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
        "HTTP-Referer": process.env.ORIGIN || "http://localhost:3000",
        "X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
      },
      body: JSON.stringify({
        model,
        messages: [{ role: "user", content: prompt }],
        temperature: 0.3,
        max_tokens: 8192, // Reduced for chunks
      }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
      throw new Error(`API error: ${response.status}`);
    }

    const result = (await response.json()) as {
      choices?: Array<{ message?: { content?: unknown } }>;
    };
    const content = result.choices?.[0]?.message?.content;
    // Non-string content is treated like an empty answer.
    const cleanedVTT = typeof content === "string" ? content.trim() : "";

    if (!cleanedVTT) {
      throw new Error("Empty response from AI");
    }

    const chunkVTT = normalizeChunkResponse(cleanedVTT);

    console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
    return chunkVTT;
  } catch (error) {
    console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
    throw error;
  }
}
320
/**
 * Clean VTT text using AI to create paragraph-separated VTT file.
 *
 * The parsed segments are split into chunks of `CHUNK_SIZE` and processed
 * sequentially. After each chunk, its last paragraph and highest paragraph
 * number are extracted and passed to the next chunk as context so paragraph
 * numbering and sentences can continue across chunk boundaries.
 *
 * Failure handling:
 * - if no segments can be parsed, the input is returned unchanged;
 * - if a single chunk fails, that chunk's original segments are emitted
 *   as-is and the context for the following chunk is reset;
 * - any other exception results in the original `vttContent` being returned.
 *
 * Reads `LLM_API_KEY`, `LLM_API_BASE_URL` and `LLM_MODEL` from the environment.
 *
 * @param transcriptionId - Identifier used in log messages.
 * @param vttContent - Raw VTT document to clean.
 * @returns A VTT document starting with a `WEBVTT` header, or the original
 *   content when cleaning was not possible.
 */
export async function cleanVTT(
  transcriptionId: string,
  vttContent: string,
): Promise<string> {
  const segments = parseVTT(vttContent);

  if (segments.length === 0) {
    return vttContent;
  }

  console.log(
    `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
  );

  // Validated at startup
  const apiKey = process.env.LLM_API_KEY as string;
  const apiBaseUrl = process.env.LLM_API_BASE_URL as string;
  const model = process.env.LLM_MODEL as string;

  try {
    // Build the input segments; `index` is the 0-based position in the file.
    const inputSegments = segments.map((seg, idx) => ({
      index: idx,
      timestamp: seg.timestamp,
      text: seg.text,
    }));

    // Prepare chunks for sequential processing (slice clamps at the end).
    const chunks: Array<typeof inputSegments> = [];
    for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
      chunks.push(inputSegments.slice(i, i + CHUNK_SIZE));
    }

    console.log(
      `[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`,
    );

    // Process chunks sequentially with context from previous chunk
    const processedChunks: string[] = [];
    let previousParagraphText: string | undefined;
    let previousParagraphNumber: string | null = null;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      if (!chunk || chunk.length === 0) continue;

      try {
        const processedChunk = await processVTTChunk(
          transcriptionId,
          chunk,
          i,
          previousParagraphNumber,
          apiKey,
          apiBaseUrl,
          model,
          previousParagraphText,
        );
        processedChunks.push(processedChunk);
        console.log(
          `[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? " (with context)" : ""}`,
        );

        // Extract context for the next chunk
        if (i < chunks.length - 1) {
          const {
            segments: lastParagraphText,
            paragraphNumber,
            highestParagraphNumber,
          } = extractLastParagraphAndHighestNumber(processedChunk);

          if (lastParagraphText) {
            console.log(
              `[VTTCleaner] Using paragraph ${paragraphNumber || "unknown"} as context for next chunk (highest paragraph: ${highestParagraphNumber})`,
            );
            previousParagraphText = lastParagraphText;
            previousParagraphNumber = highestParagraphNumber.toString();
          } else {
            previousParagraphText = undefined;
            previousParagraphNumber = null;
          }
        }
      } catch (error) {
        console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
        // Return the original segments for this chunk if processing fails.
        // The index is written directly: `seg.index || ""` would turn the
        // first segment's index 0 into an empty identifier line.
        const fallbackChunk = chunk
          .map((seg) => `${seg.index}\n${seg.timestamp}\n${seg.text}`)
          .join("\n\n");
        processedChunks.push(fallbackChunk);
        previousParagraphText = undefined;
        previousParagraphNumber = null;
      }
    }

    // Combine all processed chunks
    const finalVTT = `WEBVTT\n\n${processedChunks.join("\n\n")}`;

    console.log(
      `[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
    );

    return finalVTT;
  } catch (error) {
    console.error("[VTTCleaner] Exception:", error);
    console.warn("[VTTCleaner] Falling back to uncleaned VTT");
    return vttContent;
  }
}