馃 distributed transcription service
thistle.dunkirk.sh
// Parse and clean VTT files using AI
2
/**
 * A single cue extracted from a WebVTT document.
 */
interface VTTSegment {
  /** Cue identifier taken from the line preceding the timing line, if any. */
  index?: string | number;
  /** The raw timing line, e.g. "00:00:00.000 --> 00:00:05.559". */
  timestamp: string;
  /** Cue text; multi-line payloads are joined with "\n". */
  text: string;
  /** Cue start in seconds; only set when the timing line could be parsed. */
  start?: number;
  /** Cue end in seconds; only set when the timing line could be parsed. */
  end?: number;
}
10
/**
 * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds.
 *
 * @param ts - Timestamp such as "00:00:09.039" or "01:30.500". Whitespace
 *   around each colon-separated component is ignored.
 * @returns The timestamp in (fractional) seconds. Returns 0 when `ts` is
 *   missing or empty, when it does not have exactly two or three components,
 *   or when a component is not numeric.
 */
function parseTimestampToSeconds(ts?: string): number {
  if (!ts) return 0;

  const parts = ts.split(":").map((p) => p.trim());
  const [first = "0", second = "0", third = "0"] = parts;

  let total: number;
  if (parts.length === 3) {
    // hh:mm:ss.mmm
    total =
      parseInt(first, 10) * 3600 + parseInt(second, 10) * 60 + parseFloat(third);
  } else if (parts.length === 2) {
    // mm:ss.mmm — with only two components the first is minutes and the
    // second is seconds (there is no hours component to skip over).
    total = parseInt(first, 10) * 60 + parseFloat(second);
  } else {
    return 0;
  }

  // Non-numeric components make parseInt/parseFloat yield NaN; fall back to 0
  // like every other unparseable input instead of leaking NaN to callers.
  return Number.isFinite(total) ? total : 0;
}
30
/**
 * Parse VTT content into segments, populating start/end in seconds.
 *
 * The parser is line oriented:
 * - a blank line ends the current block;
 * - a line containing "-->" is a timing line and starts a cue;
 * - non-blank lines after a timing line are the cue text (joined with "\n");
 * - a non-blank line before a timing line is kept as the cue identifier
 *   (`index`, stored as the trimmed string), except for a bare "WEBVTT" line.
 *
 * Cues without text are skipped. `start`/`end` are only set when the timing
 * line matches `<time> --> <time>`.
 *
 * @param vttContent - Raw WebVTT text; both LF and CRLF line endings are accepted.
 * @returns The parsed cues in document order.
 */
export function parseVTT(vttContent: string): VTTSegment[] {
  const timingPattern = /([\d:.]+)\s*-->\s*([\d:.]+)/;
  const segments: VTTSegment[] = [];

  // State of the block currently being read.
  let cueId: string | undefined;
  let timestamp: string | undefined;
  let textLines: string[] = [];

  // Emit the pending cue (if it has both a timing line and text) and reset
  // all state, so nothing from this block can leak into the next one.
  const flush = (): void => {
    const text = textLines.join("\n").trim();
    if (timestamp && text) {
      const segment: VTTSegment =
        cueId === undefined
          ? { timestamp, text }
          : { index: cueId, timestamp, text };
      const match = timingPattern.exec(timestamp);
      if (match) {
        segment.start = parseTimestampToSeconds(match[1]);
        segment.end = parseTimestampToSeconds(match[2]);
      }
      segments.push(segment);
    }
    cueId = undefined;
    timestamp = undefined;
    textLines = [];
  };

  for (const rawLine of vttContent.split(/\r?\n/)) {
    const line = rawLine.trim();

    if (!line) {
      // Blank line: end of the current block (cue, header, note, ...).
      flush();
      continue;
    }

    if (line.includes("-->")) {
      // A timing line directly after another cue's text (no blank separator)
      // must not overwrite that cue — emit it first.
      if (timestamp !== undefined) flush();
      timestamp = line;
      continue;
    }

    if (timestamp !== undefined) {
      // Inside a cue: keep the line as written; the joined text is trimmed
      // as a whole when the cue is emitted.
      textLines.push(rawLine);
      continue;
    }

    if (line === "WEBVTT") {
      continue;
    }

    // Any other line ahead of a timing line is treated as the cue identifier.
    cueId = line;
  }

  // Emit the last cue when the input does not end with a blank line.
  flush();

  return segments;
}
104
/** Minimal shape read from the chat-completions response body. */
interface VTTCleanerCompletionPayload {
  choices?: Array<{ message?: { content?: unknown } }>;
}

/** Build the instruction prompt sent to the model for the given segments. */
function buildVTTCleaningPrompt(
  inputSegments: ReadonlyArray<{ index: number; timestamp: string; text: string }>,
): string {
  return `Can you turn this into a paragraph separated vtt file?

Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

Paragraph 1-1
00:00:00.000 --> 00:00:05.559
Today in chapel we are talking about the fact that we believe in having gospel

Paragraph 1-2
00:00:05.559 --> 00:00:08.639
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's

Paragraph 1-3
00:00:08.639 --> 00:00:11.960
gonna be a little more conversational than normal.

Paragraph 2-1
00:00:11.960 --> 00:00:15.000
Now let's talk about something different.

I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

Input segments:
${JSON.stringify(inputSegments, null, 2)}

Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
}

/** Strip an optional markdown code fence and make the text start with "WEBVTT". */
function extractVTTFromCompletion(completion: string): string {
  let vtt = completion;

  // Accept any fence info string (```vtt, ```webvtt, ```text, ...) so the
  // closing fence never ends up inside the returned VTT.
  const fenced = /```[^\n`]*\r?\n([\s\S]*?)```/.exec(completion);
  if (fenced?.[1]) {
    vtt = fenced[1].trim();
  }

  if (!vtt.startsWith("WEBVTT")) {
    const headerAt = vtt.indexOf("WEBVTT");
    vtt = headerAt !== -1 ? vtt.substring(headerAt) : `WEBVTT\n\n${vtt}`;
  }

  return vtt;
}

/**
 * Clean VTT text using AI to create a paragraph-separated VTT file.
 *
 * The parsed segments are sent in a single user message to
 * `${LLM_API_BASE_URL}/chat/completions`, and the model's reply is returned
 * as the new VTT document. This is best effort: the original `vttContent` is
 * returned unchanged when there are no segments, when `LLM_API_KEY`,
 * `LLM_API_BASE_URL` or `LLM_MODEL` is unset, when the request fails or
 * returns a non-OK status, or when the reply is empty or contains no
 * parseable cues.
 *
 * @param transcriptionId - Identifier used only in log messages.
 * @param vttContent - The raw VTT document to clean.
 * @returns The cleaned VTT (starting with "WEBVTT"), or `vttContent` as a fallback.
 */
export async function cleanVTT(
  transcriptionId: string,
  vttContent: string,
): Promise<string> {
  const segments = parseVTT(vttContent);

  if (segments.length === 0) {
    return vttContent;
  }

  console.log(
    `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
  );

  const apiKey = process.env.LLM_API_KEY;
  const apiBaseUrl = process.env.LLM_API_BASE_URL;
  const model = process.env.LLM_MODEL;

  if (!apiKey || !apiBaseUrl || !model) {
    console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
    return vttContent;
  }

  try {
    // Build the input for the AI
    const inputSegments = segments.map((seg, idx) => ({
      index: idx,
      timestamp: seg.timestamp,
      text: seg.text,
    }));
    const prompt = buildVTTCleaningPrompt(inputSegments);

    // Tolerate a base URL configured with a trailing slash.
    const endpoint = `${apiBaseUrl.replace(/\/+$/, "")}/chat/completions`;

    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${apiKey}`,
        "HTTP-Referer": "https://thistle.app",
        "X-Title": "Thistle Transcription",
      },
      body: JSON.stringify({
        model,
        messages: [{ role: "user", content: prompt }],
        temperature: 0.3,
        max_tokens: 16384,
      }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
      console.warn("[VTTCleaner] Falling back to uncleaned VTT");
      return vttContent;
    }

    const payload = (await response.json()) as
      | VTTCleanerCompletionPayload
      | null
      | undefined;
    const content = payload?.choices?.[0]?.message?.content;
    const cleanedVTT = typeof content === "string" ? content.trim() : "";

    if (!cleanedVTT) {
      console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
      return vttContent;
    }

    // Extract VTT content if the model wrapped it in markdown and ensure it
    // starts with WEBVTT
    const finalVTT = extractVTTFromCompletion(cleanedVTT);

    // A reply without a single cue (e.g. a refusal or an explanation) would
    // otherwise replace the transcript with unusable text.
    if (parseVTT(finalVTT).length === 0) {
      console.warn("[VTTCleaner] AI response contained no parseable cues, returning uncleaned VTT");
      return vttContent;
    }

    console.log(
      `[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`,
    );

    return finalVTT;
  } catch (err) {
    console.error("[VTTCleaner] Exception:", err);
    console.warn("[VTTCleaner] Falling back to uncleaned VTT");
    return vttContent;
  }
}