馃 distributed transcription service thistle.dunkirk.sh
at v0.1.0 14 kB view raw
// Parse and clean VTT files using AI

/** One parsed WebVTT cue. */
interface VTTSegment {
  /** Cue identifier line that preceded the timing line, if any. */
  index?: number | string;
  /** Raw timing line, e.g. "00:00:01.000 --> 00:00:04.000". */
  timestamp: string;
  /** Cue payload; multi-line payloads are joined with "\n". */
  text: string;
  /** Cue start in seconds (set when the timing line could be parsed). */
  start?: number;
  /** Cue end in seconds (set when the timing line could be parsed). */
  end?: number;
}

/** Captures the start and end timestamps of a cue timing line. */
const CUE_TIMING_PATTERN = /([\d:.]+)\s*-->\s*([\d:.]+)/;

/** Captures the paragraph number X of a "Paragraph X-Y" cue identifier. */
const PARAGRAPH_ID_PATTERN = /Paragraph (\d+)-\d+/;

/**
 * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds.
 *
 * @param ts - Timestamp such as "00:00:09.039" or "01:30.500".
 * @returns The timestamp in seconds, or 0 when the input is missing,
 *   has an unexpected number of ":"-separated fields, or is not numeric.
 */
function parseTimestampToSeconds(ts?: string): number {
  if (!ts) return 0;

  const parts = ts.split(":").map((p) => p.trim());
  let seconds: number;

  if (parts.length === 3) {
    seconds =
      parseInt(parts[0] ?? "0", 10) * 3600 +
      parseInt(parts[1] ?? "0", 10) * 60 +
      parseFloat(parts[2] ?? "0");
  } else if (parts.length === 2) {
    // Short form: the FIRST field is minutes and the second is seconds.
    seconds = parseInt(parts[0] ?? "0", 10) * 60 + parseFloat(parts[1] ?? "0");
  } else {
    return 0;
  }

  // Garbage fields produce NaN; report 0 rather than leaking NaN to callers.
  return Number.isFinite(seconds) ? seconds : 0;
}

/**
 * Parse VTT content into segments, populating start/end in seconds.
 *
 * Accepts LF and CRLF line endings, an optional BOM, and a "WEBVTT" header
 * with trailing text. Cues without any payload text are dropped. A cue is
 * closed by a blank line, by the next timing line, or by end of input.
 *
 * @param vttContent - Raw WebVTT document.
 * @returns The cues in document order.
 */
export function parseVTT(vttContent: string): VTTSegment[] {
  const lines = vttContent.replace(/^\uFEFF/, "").split(/\r?\n/);
  const segments: VTTSegment[] = [];
  let current: Partial<VTTSegment> = {};

  // Push the pending cue if it has both a timing line and text.
  const commitCue = (): void => {
    const { timestamp, text } = current;
    if (!timestamp || !text) return;

    const segment: VTTSegment = { ...current, timestamp, text };
    const timing = CUE_TIMING_PATTERN.exec(timestamp);
    if (timing?.[1] && timing[2]) {
      segment.start = parseTimestampToSeconds(timing[1]);
      segment.end = parseTimestampToSeconds(timing[2]);
    }
    segments.push(segment);
  };

  for (let i = 0; i < lines.length; i++) {
    const line = (lines[i] ?? "").trim();

    if (!line) {
      // A blank line ends the current block. Anything collected that did not
      // form a complete cue (header metadata, NOTE lines, an id with no
      // timing line) is discarded so it cannot leak into the next cue.
      commitCue();
      current = {};
      continue;
    }

    if (line.includes("-->")) {
      // A timing line while a cue is still pending means the blank separator
      // was omitted; commit the pending cue instead of overwriting it.
      if (current.timestamp) {
        commitCue();
        current = {};
      }
      current.timestamp = line;

      // Following lines are payload until a blank line or the next timing line.
      const textLines: string[] = [];
      while (i + 1 < lines.length) {
        const next = lines[i + 1] ?? "";
        if (!next.trim() || next.includes("-->")) break;
        textLines.push(next);
        i++;
      }
      current.text = textLines.join("\n").trim();
      continue;
    }

    // File header: bare "WEBVTT" or "WEBVTT" followed by header text.
    if (line === "WEBVTT" || /^WEBVTT\s/.test(line)) {
      continue;
    }

    // Any other line before a timing line is treated as the cue identifier.
    if (!current.timestamp) {
      current.index = line;
    }
  }

  // Input may end without a trailing blank line.
  commitCue();

  return segments;
}

/**
 * Chunk size for VTT processing
 */
const CHUNK_SIZE = 40; // Segments per chunk

/**
 * Find paragraph boundaries in processed VTT content.
 *
 * Cue blocks are separated by one or more blank lines (LF or CRLF). A block
 * belongs to paragraph X when its first line contains "Paragraph X-Y".
 *
 * @param vttContent - Processed VTT text using "Paragraph X-Y" identifiers.
 * @returns `segments`: the blocks of the last paragraph joined with blank
 *   lines ("" if none found); `paragraphNumber`: that paragraph's number as
 *   text, or null; `highestParagraphNumber`: the largest paragraph number
 *   seen anywhere in the input (0 if none).
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
  segments: string;
  paragraphNumber: string | null;
  highestParagraphNumber: number;
} {
  if (!vttContent)
    return { segments: "", paragraphNumber: null, highestParagraphNumber: 0 };

  // Normalise line endings and tolerate runs of blank lines so that every
  // block starts on its identifier line.
  const segments = vttContent
    .replace(/\r\n/g, "\n")
    .split(/\n\s*\n/)
    .map((block) => block.trim())
    .filter(Boolean);
  if (segments.length === 0)
    return { segments: "", paragraphNumber: null, highestParagraphNumber: 0 };

  const paragraphNumberOf = (block: string): string | null => {
    const firstLine = block.split("\n")[0] ?? "";
    return PARAGRAPH_ID_PATTERN.exec(firstLine)?.[1] ?? null;
  };

  // Scan everything for the highest paragraph number.
  let highestParagraphNumber = 0;
  for (const segment of segments) {
    const found = paragraphNumberOf(segment);
    if (found === null) continue;
    const paragraphNum = parseInt(found, 10);
    if (!Number.isNaN(paragraphNum) && paragraphNum > highestParagraphNumber) {
      highestParagraphNumber = paragraphNum;
    }
  }

  // Walk backwards collecting the blocks of the final paragraph.
  const lastSegments: string[] = [];
  let currentParagraphNumber: string | null = null;
  for (let i = segments.length - 1; i >= 0; i--) {
    const segment = segments[i];
    if (!segment) continue;

    const paragraphNumber = paragraphNumberOf(segment);
    if (paragraphNumber === null) {
      // Unlabelled block: keep it only once collection has started.
      if (currentParagraphNumber) lastSegments.unshift(segment);
      continue;
    }

    if (!currentParagraphNumber) {
      // First labelled block found from the end defines the last paragraph.
      currentParagraphNumber = paragraphNumber;
    } else if (paragraphNumber !== currentParagraphNumber) {
      // Reached the preceding paragraph.
      break;
    }
    lastSegments.unshift(segment);
  }

  return {
    segments: lastSegments.join("\n\n"),
    paragraphNumber: currentParagraphNumber,
    highestParagraphNumber,
  };
}

/**
 * Strip wrapping the model may add around its answer: a Markdown code fence
 * (with any language tag) and a leading "WEBVTT" header plus the blank lines
 * that follow it.
 */
function normalizeChunkOutput(raw: string): string {
  let chunkVTT = raw;

  if (chunkVTT.includes("```")) {
    // Accept any info string after the fence (vtt, webvtt, text, ...).
    const fenced = /```[^\n`]*\n([\s\S]*?)```/.exec(chunkVTT);
    if (fenced?.[1]) {
      chunkVTT = fenced[1].trim();
    }
  }

  // Remove WEBVTT header if present (it is added once at the end).
  if (chunkVTT.startsWith("WEBVTT")) {
    const lines = chunkVTT.split("\n");
    let i = 1;
    while (i < lines.length && !lines[i]?.trim()) {
      i++;
    }
    chunkVTT = lines.slice(i).join("\n");
  }

  return chunkVTT;
}

/**
 * Process a chunk of VTT segments using AI.
 *
 * Sends the segments to `${apiBaseUrl}/chat/completions` and returns the
 * model's paragraph-labelled VTT text without a "WEBVTT" header.
 *
 * @param transcriptionId - Used only to label log messages.
 * @param inputSegments - Cues to process, serialised as JSON into the prompt.
 * @param chunkIndex - Zero-based chunk position, used in logs and headers.
 * @param previousParagraphNumber - Highest paragraph number emitted so far,
 *   or null when numbering should start at 1.
 * @param apiKey - Bearer token for the API.
 * @param apiBaseUrl - API base URL; trailing slashes are ignored.
 * @param model - Model identifier sent in the request body.
 * @param previousParagraphText - Last paragraph of the previous chunk, given
 *   to the model as context only.
 * @returns The cleaned VTT text for this chunk.
 * @throws Error when the HTTP response is not OK or contains no text content;
 *   network and JSON errors from `fetch` are logged and rethrown.
 */
async function processVTTChunk(
  transcriptionId: string,
  inputSegments: Array<{ index: number; timestamp: string; text: string }>,
  chunkIndex: number,
  previousParagraphNumber: string | null,
  apiKey: string,
  apiBaseUrl: string,
  model: string,
  previousParagraphText?: string,
): Promise<string> {
  const chunkId = `${transcriptionId}-chunk${chunkIndex}`;

  const hasTextContext = !!previousParagraphText;

  console.log(
    `[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? " and previous paragraph text context" : ""}`,
  );

  // Fall back to "1" when there is no usable previous number, so the prompt
  // never contains "NaN".
  const previousNumber = Number.parseInt(previousParagraphNumber ?? "", 10);
  const nextParagraphNumber = Number.isNaN(previousNumber)
    ? "1"
    : String(previousNumber + 1);

  let contextSection: string;
  if (hasTextContext) {
    contextSection = `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:

${previousParagraphText}

Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ""}`;
  } else if (previousParagraphNumber) {
    // No text context (e.g. the previous chunk failed) but earlier paragraphs
    // exist: keep numbering monotonic instead of restarting at 1.
    contextSection = `Process the following segments. Start your paragraphs with number ${nextParagraphNumber}:`;
  } else {
    contextSection = "Process the following segments:";
  }

  const prompt = `Can you turn this into a paragraph separated vtt file?

Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

Paragraph 1-1
00:00:00.000 --> 00:00:05.559
Today in chapel we are talking about the fact that we believe in having gospel

Paragraph 1-2
00:00:05.559 --> 00:00:08.639
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's

Paragraph 1-3
00:00:08.639 --> 00:00:11.960
gonna be a little more conversational than normal.

Paragraph 2-1
00:00:11.960 --> 00:00:15.000
Now let's talk about something different.

I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

Here are important guidelines for forming paragraphs:
1. Create a new paragraph when there's a change in topic or speaker.
2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
3. Group related thoughts together in the same paragraph.
4. Start a new paragraph when a sentence introduces a completely new idea.
5. Focus on the number of sentences, not segments, when creating paragraphs.
6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.

Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

${contextSection}

${JSON.stringify(inputSegments, null, 2)}

Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;

  try {
    // Avoid "//chat/completions" when the configured base URL ends in "/".
    const endpoint = `${apiBaseUrl.replace(/\/+$/, "")}/chat/completions`;
    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
        "HTTP-Referer": "https://thistle.app",
        "X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
      },
      body: JSON.stringify({
        model,
        messages: [{ role: "user", content: prompt }],
        temperature: 0.3,
        max_tokens: 8192, // Reduced for chunks
      }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
      throw new Error(`API error: ${response.status}`);
    }

    // Only the fields read below are described; the payload is untrusted.
    const result = (await response.json()) as {
      choices?: Array<{ message?: { content?: unknown } }>;
    } | null;
    const content = result?.choices?.[0]?.message?.content;
    const cleanedVTT = typeof content === "string" ? content.trim() : "";

    if (!cleanedVTT) {
      throw new Error("Empty response from AI");
    }

    const chunkVTT = normalizeChunkOutput(cleanedVTT);

    console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
    return chunkVTT;
  } catch (error) {
    console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
    throw error;
  }
}

/**
 * Clean VTT text using AI to create paragraph-separated VTT file.
 *
 * Segments are processed sequentially in chunks of CHUNK_SIZE. Each chunk
 * receives the last paragraph of the previous chunk as context, plus the
 * highest paragraph number emitted so far, to keep numbering continuous.
 * A chunk that fails is replaced by its original cues and processing
 * continues with the next chunk.
 *
 * Reads LLM_API_KEY, LLM_API_BASE_URL and LLM_MODEL from the environment.
 *
 * @param transcriptionId - Used to label log messages.
 * @param vttContent - Raw WebVTT document.
 * @returns The cleaned document starting with "WEBVTT", or `vttContent`
 *   unchanged when it has no cues, configuration is incomplete, or an
 *   unexpected error occurs.
 */
export async function cleanVTT(
  transcriptionId: string,
  vttContent: string,
): Promise<string> {
  const segments = parseVTT(vttContent);

  if (segments.length === 0) {
    return vttContent;
  }

  console.log(
    `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
  );

  const apiKey = process.env.LLM_API_KEY;
  const apiBaseUrl = process.env.LLM_API_BASE_URL;
  const model = process.env.LLM_MODEL;

  if (!apiKey || !apiBaseUrl || !model) {
    console.warn(
      "[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT",
    );
    return vttContent;
  }

  try {
    // Build the input segments
    const inputSegments = segments.map((seg, idx) => ({
      index: idx,
      timestamp: seg.timestamp,
      text: seg.text,
    }));

    // Prepare chunks for sequential processing (slice clamps at the end).
    const chunks: Array<typeof inputSegments> = [];
    for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
      chunks.push(inputSegments.slice(i, i + CHUNK_SIZE));
    }

    console.log(
      `[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`,
    );

    // Process chunks sequentially with context from previous chunk
    const processedChunks: string[] = [];
    let previousParagraphText: string | undefined;
    let previousParagraphNumber: string | null = null;
    // Running maximum across all chunks so numbering never restarts after a
    // failed chunk or a chunk without paragraph labels.
    let highestParagraphSeen = 0;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      if (!chunk || chunk.length === 0) continue;

      try {
        const processedChunk = await processVTTChunk(
          transcriptionId,
          chunk,
          i,
          previousParagraphNumber,
          apiKey,
          apiBaseUrl,
          model,
          previousParagraphText,
        );
        processedChunks.push(processedChunk);
        console.log(
          `[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? " (with context)" : ""}`,
        );

        // Extract context for the next chunk
        if (i < chunks.length - 1) {
          const {
            segments: lastParagraphText,
            paragraphNumber,
            highestParagraphNumber,
          } = extractLastParagraphAndHighestNumber(processedChunk);

          highestParagraphSeen = Math.max(
            highestParagraphSeen,
            highestParagraphNumber,
          );

          if (lastParagraphText) {
            console.log(
              `[VTTCleaner] Using paragraph ${paragraphNumber || "unknown"} as context for next chunk (highest paragraph: ${highestParagraphNumber})`,
            );
            previousParagraphText = lastParagraphText;
          } else {
            previousParagraphText = undefined;
          }
          previousParagraphNumber =
            highestParagraphSeen > 0 ? String(highestParagraphSeen) : null;
        }
      } catch (error) {
        console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
        // Emit the original cues for this chunk. The index is always written:
        // it is numeric and 0 is a valid value for the first segment.
        const fallbackChunk = chunk
          .map((seg) => `${seg.index}\n${seg.timestamp}\n${seg.text}`)
          .join("\n\n");
        processedChunks.push(fallbackChunk);
        previousParagraphText = undefined;
        previousParagraphNumber =
          highestParagraphSeen > 0 ? String(highestParagraphSeen) : null;
      }
    }

    // Combine all processed chunks
    const finalVTT = `WEBVTT\n\n${processedChunks.join("\n\n")}`;

    console.log(
      `[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
    );

    return finalVTT;
  } catch (error) {
    console.error("[VTTCleaner] Exception:", error);
    console.warn("[VTTCleaner] Falling back to uncleaned VTT");
    return vttContent;
  }
}