馃 distributed transcription service thistle.dunkirk.sh
// Parse and clean VTT files using AI

/**
 * One parsed cue.
 *
 * `timestamp` is the raw timing line as it appeared in the file; `start` and
 * `end` are that line's two timestamps converted to seconds and are only set
 * when the timing line could be parsed.
 */
interface VTTSegment {
	index?: number | string;
	timestamp: string;
	text: string;
	start?: number;
	end?: number;
}

/** Matches the two timestamps of a cue timing line ("start --> end"). */
const CUE_TIMING_RE = /([\d:.]+)\s*-->\s*([\d:.]+)/;

/** Matches the file header line: "WEBVTT" alone or followed by header text. */
const WEBVTT_HEADER_RE = /^WEBVTT(?:[ \t]|$)/;

/**
 * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds.
 *
 * @param ts - Timestamp text; a missing or empty value yields 0.
 * @returns Seconds as a number, or 0 when the value does not have two or
 *   three colon-separated parts.
 */
function parseTimestampToSeconds(ts?: string): number {
	if (!ts) return 0;
	// ts expected like "00:00:09.039" or "00:09.039"
	const parts = ts.split(":").map((p) => p.trim());

	if (parts.length === 3) {
		const [hh = "0", mm = "0", ss = "0"] = parts;
		return parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
	}

	if (parts.length === 2) {
		// Short form: the FIRST part is minutes and the second is seconds.
		const [mm = "0", ss = "0"] = parts;
		return parseInt(mm, 10) * 60 + parseFloat(ss);
	}

	return 0;
}

/**
 * Build a finished segment from the pieces collected while scanning a cue,
 * filling in `start`/`end` when the timing line matches CUE_TIMING_RE.
 */
function buildSegment(
	timestamp: string,
	text: string,
	index?: number | string,
): VTTSegment {
	const segment: VTTSegment = { timestamp, text };
	if (index !== undefined) {
		segment.index = index;
	}

	const match = CUE_TIMING_RE.exec(timestamp);
	if (match) {
		segment.start = parseTimestampToSeconds(match[1]);
		segment.end = parseTimestampToSeconds(match[2]);
	}

	return segment;
}

/**
 * Parse VTT content into segments, populating start/end in seconds.
 *
 * Cues with empty text are dropped. A non-blank line that directly precedes a
 * timing line is stored verbatim (as a string) in `index`.
 *
 * @param vttContent - Raw file content; LF, CRLF and CR line endings are accepted.
 * @returns The cues in file order.
 */
export function parseVTT(vttContent: string): VTTSegment[] {
	const lines = vttContent.split(/\r\n|\r|\n/);
	const segments: VTTSegment[] = [];
	let current: Partial<VTTSegment> = {};

	// Emit the pending cue if it is complete, then ALWAYS start from a clean
	// slate so that NOTE blocks, header metadata or empty cues cannot leak
	// their "identifier" into the next cue.
	const flush = (): void => {
		if (current.timestamp && current.text) {
			segments.push(
				buildSegment(current.timestamp, current.text, current.index),
			);
		}
		current = {};
	};

	for (let i = 0; i < lines.length; i++) {
		const line = lines[i]?.trim();

		// A blank line terminates whatever block we were in.
		if (!line) {
			flush();
			continue;
		}

		if (WEBVTT_HEADER_RE.test(line)) {
			continue;
		}

		// Any other line without an arrow is treated as a cue identifier
		// (cue text itself is consumed below, right after the timing line).
		if (!line.includes("-->")) {
			current.index = line;
			continue;
		}

		// Timing line. If the previous cue was not closed by a blank line,
		// emit it now instead of silently overwriting it.
		if (current.timestamp) {
			flush();
		}
		current.timestamp = line;

		// Everything up to the next blank line or timing line is cue text.
		const textLines: string[] = [];
		let next = i + 1;
		while (next < lines.length) {
			const candidate = lines[next] ?? "";
			if (!candidate.trim() || candidate.includes("-->")) break;
			textLines.push(candidate);
			next++;
		}
		current.text = textLines.join("\n").trim();

		// Resume the outer loop on the line that stopped the text scan.
		i = next - 1;
	}

	// Add last segment if the file did not end with a blank line.
	flush();

	return segments;
}

/**
 * Pull `choices[0].message.content` out of a decoded JSON response body,
 * returning undefined when the body does not have that shape.
 */
function extractMessageContent(payload: unknown): string | undefined {
	if (typeof payload !== "object" || payload === null) return undefined;

	const choices = (payload as { choices?: unknown }).choices;
	if (!Array.isArray(choices)) return undefined;

	const first: unknown = choices[0];
	if (typeof first !== "object" || first === null) return undefined;

	const message = (first as { message?: unknown }).message;
	if (typeof message !== "object" || message === null) return undefined;

	const content = (message as { content?: unknown }).content;
	return typeof content === "string" ? content.trim() : undefined;
}

/**
 * Turn raw model output into VTT text: unwrap a fenced markdown block if
 * present and make sure the result starts with the "WEBVTT" header.
 */
function normalizeModelVTT(raw: string): string {
	let vtt = raw;

	// Extract VTT content if the model wrapped it in markdown. Any alphabetic
	// fence tag (vtt, webvtt, text, ...) and CRLF after the fence are accepted.
	if (raw.includes("```")) {
		const fenced = raw.match(/```[a-zA-Z]*\r?\n([\s\S]*?)```/);
		if (fenced?.[1]) {
			vtt = fenced[1].trim();
		}
	}

	// Ensure it starts with WEBVTT
	if (!vtt.startsWith("WEBVTT")) {
		const headerAt = vtt.indexOf("WEBVTT");
		vtt = headerAt !== -1 ? vtt.substring(headerAt) : `WEBVTT\n\n${vtt}`;
	}

	return vtt;
}

/**
 * Clean VTT text using AI to create a paragraph-separated VTT file.
 *
 * Sends the parsed segments to the chat-completions endpoint configured via
 * the LLM_API_KEY, LLM_API_BASE_URL and LLM_MODEL environment variables and
 * asks the model to group them into paragraphs while keeping the timings.
 *
 * This function never throws for LLM problems: when the input has no cues,
 * the configuration is incomplete, the request fails, or the reply is empty
 * or contains no parseable cues, the original `vttContent` is returned.
 *
 * @param transcriptionId - Identifier used only in log messages.
 * @param vttContent - The VTT file to clean.
 * @returns The cleaned VTT, or `vttContent` unchanged on any fallback path.
 */
export async function cleanVTT(
	transcriptionId: string,
	vttContent: string,
): Promise<string> {
	const segments = parseVTT(vttContent);

	if (segments.length === 0) {
		return vttContent;
	}

	console.log(
		`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
	);

	const apiKey = process.env.LLM_API_KEY;
	const apiBaseUrl = process.env.LLM_API_BASE_URL;
	const model = process.env.LLM_MODEL;

	if (!apiKey || !apiBaseUrl || !model) {
		console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
		return vttContent;
	}

	try {
		// Build the input for the AI
		const inputSegments = segments.map((seg, idx) => ({
			index: idx,
			timestamp: seg.timestamp,
			text: seg.text,
		}));

		const prompt = `Can you turn this into a paragraph separated vtt file?

Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

Paragraph 1-1
00:00:00.000 --> 00:00:05.559
Today in chapel we are talking about the fact that we believe in having gospel

Paragraph 1-2
00:00:05.559 --> 00:00:08.639
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's

Paragraph 1-3
00:00:08.639 --> 00:00:11.960
gonna be a little more conversational than normal.

Paragraph 2-1
00:00:11.960 --> 00:00:15.000
Now let's talk about something different.

I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

Input segments:
${JSON.stringify(inputSegments, null, 2)}

Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;

		// Tolerate a base URL configured with a trailing slash.
		const endpoint = `${apiBaseUrl.replace(/\/+$/, "")}/chat/completions`;

		const response = await fetch(endpoint, {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				"Authorization": `Bearer ${apiKey}`,
				"HTTP-Referer": "https://thistle.app",
				"X-Title": "Thistle Transcription",
			},
			body: JSON.stringify({
				model,
				messages: [{ role: "user", content: prompt }],
				temperature: 0.3,
				max_tokens: 16384,
			}),
		});

		if (!response.ok) {
			const errorText = await response.text();
			console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
			console.warn("[VTTCleaner] Falling back to uncleaned VTT");
			return vttContent;
		}

		const result: unknown = await response.json();
		const cleanedVTT = extractMessageContent(result);

		if (!cleanedVTT) {
			console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
			return vttContent;
		}

		const finalVTT = normalizeModelVTT(cleanedVTT);

		// A reply without a single parseable cue would wipe out the transcript;
		// keep the original instead.
		if (parseVTT(finalVTT).length === 0) {
			console.warn("[VTTCleaner] AI response contained no cues, returning uncleaned VTT");
			return vttContent;
		}

		console.log(
			`[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`,
		);

		return finalVTT;
	} catch (err) {
		console.error("[VTTCleaner] Exception:", err);
		console.warn("[VTTCleaner] Falling back to uncleaned VTT");
		return vttContent;
	}
}