···
1
-
// Parse and clean VTT files
3
-
import type { ParagraphBoundary } from "./transcript-cleaner";
1
+
// Parse and clean VTT files using AI
4
+
index?: number | string;
···
108
-
* Clean VTT text segments by removing tags and fixing grammar.
109
-
* Additionally, merge cleaned segments into paragraph cues while preserving
110
-
* stable paragraph IDs (derived from first segment start time).
106
+
* Clean VTT text using AI to create a paragraph-separated VTT file.
107
+
* Uses the chat-completions API configured via LLM_API_BASE_URL (e.g. OpenRouter) to intelligently group segments into paragraphs
108
+
* while preserving timing information.
export async function cleanVTT(
···
`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
126
-
// Combine all text for cleaning and paragraphing
127
-
const allText = segments.map((s) => s.text).join(" ");
129
-
// Attempt LLM-driven cleaning and paragraphing in one request, fallback to deterministic rules
130
-
let paragraphBoundaries: ParagraphBoundary[] = [];
124
+
const apiKey = process.env.LLM_API_KEY;
125
+
const apiBaseUrl = process.env.LLM_API_BASE_URL;
126
+
const model = process.env.LLM_MODEL;
128
+
if (!apiKey || !apiBaseUrl || !model) {
129
+
console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
133
-
const { cleanAndGetParagraphBoundaries } = await import(
134
-
"./transcript-cleaner"
136
-
const result = await cleanAndGetParagraphBoundaries({
137
-
transcriptId: transcriptionId,
138
-
rawTranscript: allText,
139
-
segments: segments.map((s) => ({
134
+
// Build the input for the AI
135
+
const inputSegments = segments.map((seg, idx) => ({
137
+
timestamp: seg.timestamp,
141
+
const prompt = `Can you turn this into a paragraph separated vtt file?
143
+
Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
146
+
00:00:00.000 --> 00:00:05.559
147
+
Today in chapel we are talking about the fact that we believe in having gospel
150
+
00:00:05.559 --> 00:00:08.639
151
+
conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's
154
+
00:00:08.639 --> 00:00:11.960
155
+
gonna be a little more conversational than normal.
148
-
if (result?.paragraphs) {
149
-
paragraphBoundaries = result.paragraphs;
153
-
"[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
158
+
00:00:11.960 --> 00:00:15.000
159
+
Now let's talk about something different.
158
-
if (paragraphBoundaries.length === 0) {
159
-
// No paragraphs detected, treat as one big paragraph
160
-
paragraphBoundaries = [
162
-
startSegmentIndex: 0,
163
-
endSegmentIndex: segments.length - 1,
161
+
I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
169
-
// Get the full cleaned transcript from paragraphs
170
-
const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
163
+
Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
172
-
// Split cleaned text back into segments proportionally (word-based)
173
-
const words = cleanedTranscript.split(/\s+/).filter(Boolean);
174
-
const originalWords = allText.split(/\s+/).filter(Boolean);
175
-
const ratio = words.length / Math.max(1, originalWords.length);
166
+
${JSON.stringify(inputSegments, null, 2)}
178
-
const cleanedSegments: VTTSegment[] = [];
168
+
Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
180
-
for (const segment of segments) {
181
-
const originalWordCount = Math.max(
183
-
segment.text.split(/\s+/).filter(Boolean).length,
170
+
const response = await fetch(
171
+
`${apiBaseUrl}/chat/completions`,
175
+
"Content-Type": "application/json",
176
+
"Authorization": `Bearer ${apiKey}`,
177
+
"HTTP-Referer": "https://thistle.app",
178
+
"X-Title": "Thistle Transcription",
180
+
body: JSON.stringify({
183
+
{ role: "user", content: prompt },
185
-
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
186
-
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
187
-
wordIndex += newWordCount;
191
+
if (!response.ok) {
192
+
const errorText = await response.text();
193
+
console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
194
+
console.warn("[VTTCleaner] Falling back to uncleaned VTT");
189
-
cleanedSegments.push({
190
-
index: segment.index,
191
-
timestamp: segment.timestamp,
192
-
text: segmentWords.join(" "),
193
-
start: segment.start,
198
+
const result = await response.json();
199
+
const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
198
-
// If any remaining words, append to last segment
199
-
if (wordIndex < words.length && cleanedSegments.length > 0) {
200
-
const rest = words.slice(wordIndex).join(" ");
201
-
const lastIdx = cleanedSegments.length - 1;
202
-
const lastSeg = cleanedSegments[lastIdx];
204
-
lastSeg.text += (lastSeg.text ? " " : "") + rest;
202
+
console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
208
-
// Assign paragraph-based IDs to segments
209
-
for (let i = 0; i < cleanedSegments.length; i++) {
210
-
const seg = cleanedSegments[i];
211
-
if (!seg) continue;
206
+
// Extract VTT content if the model wrapped it in markdown
207
+
let finalVTT = cleanedVTT;
208
+
if (cleanedVTT.includes("```")) {
209
+
const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
210
+
if (vttMatch?.[1]) {
211
+
finalVTT = vttMatch[1].trim();
213
-
// Find which paragraph this segment belongs to
215
-
let segmentInPara = 1;
216
-
for (let p = 0; p < paragraphBoundaries.length; p++) {
217
-
const para = paragraphBoundaries[p];
218
-
if (i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
220
-
segmentInPara = i - para.startSegmentIndex + 1;
215
+
// Ensure it starts with WEBVTT
216
+
if (!finalVTT.startsWith("WEBVTT")) {
217
+
const webvttIndex = finalVTT.indexOf("WEBVTT");
218
+
if (webvttIndex !== -1) {
219
+
finalVTT = finalVTT.substring(webvttIndex);
221
+
finalVTT = `WEBVTT\n\n${finalVTT}`;
225
-
// Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
226
-
seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
226
+
`[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`,
229
-
// Build output VTT with cleaned segment cues having paragraph-based IDs
230
-
let output = "WEBVTT\n\n";
231
-
for (const seg of cleanedSegments) {
232
-
if (!seg || !seg.timestamp || !seg.text) continue;
233
-
output += `${seg.index}\n`;
234
-
output += `${seg.timestamp}\n`;
235
-
output += `${seg.text}\n\n`;
231
+
console.error("[VTTCleaner] Exception:", err);
232
+
console.warn("[VTTCleaner] Falling back to uncleaned VTT");
239
-
`[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,