···
106 - * Clean VTT text using AI to create paragraph-separated VTT file.
107 - * Uses OpenRouter API to intelligently group segments into paragraphs
108 - * while preserving timing information.
106 + * Chunk size for VTT processing
110 - export async function cleanVTT(
111 - transcriptionId: string,
112 - vttContent: string,
113 - ): Promise<string> {
114 - const segments = parseVTT(vttContent);
108 + const CHUNK_SIZE = 40; // Segments per chunk
116 - if (segments.length === 0) {
111 + * Find paragraph boundaries in processed VTT content
112 + * Returns the segments in the last paragraph and highest paragraph number found
114 + function extractLastParagraphAndHighestNumber(vttContent: string): {
116 + paragraphNumber: string | null,
117 + highestParagraphNumber: number
119 + if (!vttContent) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };
121 + // Split into segments (separated by double newline)
122 + const segments = vttContent.split('\n\n').filter(Boolean);
123 + if (segments.length === 0) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };
125 + // Get all segments from the last paragraph number
126 + const lastSegments: string[] = [];
127 + let currentParagraphNumber: string | null = null;
128 + let highestParagraphNumber = 0;
130 + // First, scan through all segments to find the highest paragraph number
131 + for (const segment of segments) {
132 + if (!segment) continue;
134 + const lines = segment.split('\n');
135 + const firstLine = lines[0] || '';
137 + // Check for paragraph number pattern
138 + const paragraphMatch = /Paragraph (\d+)-\d+/.exec(firstLine);
139 + if (paragraphMatch?.[1]) {
140 + const paragraphNum = parseInt(paragraphMatch[1], 10);
141 + if (!Number.isNaN(paragraphNum) && paragraphNum > highestParagraphNumber) {
142 + highestParagraphNumber = paragraphNum;
121 - `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
124 - const apiKey = process.env.LLM_API_KEY;
125 - const apiBaseUrl = process.env.LLM_API_BASE_URL;
126 - const model = process.env.LLM_MODEL;
128 - if (!apiKey || !apiBaseUrl || !model) {
129 - console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
147 + // Start from the end and work backwards to find the last paragraph
148 + for (let i = segments.length - 1; i >= 0; i--) {
149 + const segment = segments[i];
150 + if (!segment) continue;
152 + const lines = segment.split('\n');
153 + const firstLine = lines[0] || '';
155 + // Check for paragraph number pattern
156 + const paragraphMatch = /Paragraph (\d+)-\d+/.exec(firstLine);
157 + if (paragraphMatch?.[1]) {
158 + const paragraphNumber = paragraphMatch[1];
160 + if (!currentParagraphNumber) {
161 + // This is the first paragraph number we've found working backwards
162 + currentParagraphNumber = paragraphNumber;
163 + lastSegments.unshift(segment);
164 + } else if (paragraphNumber === currentParagraphNumber) {
165 + // Same paragraph, add it
166 + lastSegments.unshift(segment);
168 + // Different paragraph, we're done
172 + // No paragraph number, but might be part of current paragraph
173 + // Add it if we've already started collecting segments
174 + if (currentParagraphNumber) {
175 + lastSegments.unshift(segment);
181 + segments: lastSegments.join('\n\n'),
182 + paragraphNumber: currentParagraphNumber,
183 + highestParagraphNumber
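
For illustration, a minimal usage sketch of the helper above, assuming a chunk that has already been formatted with "Paragraph X-Y" cue names (the sample cues below are invented): it returns the trailing paragraph's segments and the highest paragraph number seen.

// Hypothetical input: three processed segments, the last two belonging to paragraph 2
const sample = [
  "Paragraph 1-1\n00:00:00.000 --> 00:00:04.000\nWelcome to the show.",
  "Paragraph 2-1\n00:00:04.000 --> 00:00:08.000\nToday we look at transcript cleanup.",
  "Paragraph 2-2\n00:00:08.000 --> 00:00:12.000\nStarting with paragraph grouping.",
].join("\n\n");

const { segments, paragraphNumber, highestParagraphNumber } =
  extractLastParagraphAndHighestNumber(sample);
// segments               -> the two "Paragraph 2-*" blocks joined by "\n\n"
// paragraphNumber        -> "2"
// highestParagraphNumber -> 2
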
134 - // Build the input for the AI
135 - const inputSegments = segments.map((seg, idx) => ({
137 - timestamp: seg.timestamp,
141 - const prompt = `Can you turn this into a paragraph separated vtt file?
188 + * Process a chunk of VTT segments using AI
190 + async function processVTTChunk(
191 + transcriptionId: string,
192 + inputSegments: Array<{index: number, timestamp: string, text: string}>,
193 + chunkIndex: number,
194 + previousParagraphNumber: string | null,
196 + apiBaseUrl: string,
198 + previousParagraphText?: string,
199 + ): Promise<string> {
200 + const chunkId = `${transcriptionId}-chunk${chunkIndex}`;
202 + const hasTextContext = !!previousParagraphText;
204 + console.log(`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? ' and previous paragraph text context' : ''}`);
206 + const nextParagraphNumber = previousParagraphNumber ? String(parseInt(previousParagraphNumber, 10) + 1) : '1';
208 + const prompt = `Can you turn this into a paragraph separated vtt file?
Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
···
I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
230 + Here are important guidelines for forming paragraphs:
231 + 1. Create a new paragraph when there's a change in topic or speaker.
232 + 2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
233 + 3. Group related thoughts together in the same paragraph.
234 + 4. Start a new paragraph when a sentence introduces a completely new idea.
235 + 5. Focus on the number of sentences, not segments, when creating paragraphs.
236 + 6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.
Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
241 + `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:
243 + ${previousParagraphText}
245 + Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ''}`
246 + : 'Process the following segments:'}
${JSON.stringify(inputSegments, null, 2)}
168 - Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
250 + Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;
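
For reference, a chunk response following this prompt would look roughly like the sample below (timestamps and wording invented for illustration). Note there is no WEBVTT header, and the "Paragraph X-Y" cue names are exactly what extractLastParagraphAndHighestNumber parses later.

Paragraph 1-1
00:00:00.000 --> 00:00:03.500
Welcome back to the podcast.

Paragraph 1-2
00:00:03.500 --> 00:00:07.200
Today we're looking at automatic transcript cleanup.

Paragraph 2-1
00:00:07.200 --> 00:00:11.000
Let's start with how the chunking works.
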
const response = await fetch(
`${apiBaseUrl}/chat/completions`,
···
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`,
"HTTP-Referer": "https://thistle.app",
178 - "X-Title": "Thistle Transcription",
261 + "X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
···
{ role: "user", content: prompt },
269 + max_tokens: 8192, // Reduced for chunks
const errorText = await response.text();
193 - console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
194 - console.warn("[VTTCleaner] Falling back to uncleaned VTT");
276 + console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
277 + throw new Error(`API error: ${response.status}`);
const result = await response.json();
const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
202 - console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
284 + throw new Error("Empty response from AI");
// Extract VTT content if the model wrapped it in markdown
207 - let finalVTT = cleanedVTT;
288 + let chunkVTT = cleanedVTT;
if (cleanedVTT.includes("```")) {
const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
211 - finalVTT = vttMatch[1].trim();
292 + chunkVTT = vttMatch[1].trim();
215 - // Ensure it starts with WEBVTT
216 - if (!finalVTT.startsWith("WEBVTT")) {
217 - const webvttIndex = finalVTT.indexOf("WEBVTT");
218 - if (webvttIndex !== -1) {
219 - finalVTT = finalVTT.substring(webvttIndex);
221 - finalVTT = `WEBVTT\n\n${finalVTT}`;
296 + // Remove WEBVTT header if present (we'll add it once at the end)
297 + if (chunkVTT.startsWith("WEBVTT")) {
298 + const lines = chunkVTT.split("\n");
299 + // Skip WEBVTT line and any blank lines that follow
301 + while (i < lines.length && !lines[i]?.trim()) {
304 + chunkVTT = lines.slice(i).join("\n");
307 + console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
310 + console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
316 + * Clean VTT text using AI to create paragraph-separated VTT file.
317 + * Uses OpenRouter API to intelligently group segments into paragraphs
318 + * while preserving timing information. Processes sequentially in chunks
319 + * with context from previous chunks to maintain paragraph continuity.
321 + export async function cleanVTT(
322 + transcriptionId: string,
323 + vttContent: string,
324 + ): Promise<string> {
325 + const segments = parseVTT(vttContent);
327 + if (segments.length === 0) {
332 + `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
335 + const apiKey = process.env.LLM_API_KEY;
336 + const apiBaseUrl = process.env.LLM_API_BASE_URL;
337 + const model = process.env.LLM_MODEL;
339 + if (!apiKey || !apiBaseUrl || !model) {
340 + console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
345 + // Build the input segments
346 + const inputSegments = segments.map((seg, idx) => ({
348 + timestamp: seg.timestamp,
352 + // Prepare chunks for sequential processing
353 + const chunks: Array<typeof inputSegments> = [];
354 + for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
355 + // Don't go beyond array bounds
356 + const end = Math.min(i + CHUNK_SIZE, inputSegments.length);
357 + chunks.push(inputSegments.slice(i, end));
360 + console.log(`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`);
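
For illustration (numbers invented): with CHUNK_SIZE = 40, a 100-segment transcript yields chunks of 40, 40 and 20 segments, processed strictly in order so each chunk can see the tail paragraph of the one before it.

// Standalone sketch of the chunking arithmetic above (illustrative only)
const demoSegments = Array.from({ length: 100 }, (_, i) => i);
const demoChunks: number[][] = [];
for (let i = 0; i < demoSegments.length; i += 40) {
  demoChunks.push(demoSegments.slice(i, Math.min(i + 40, demoSegments.length)));
}
console.log(demoChunks.map(c => c.length)); // [40, 40, 20]
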
362 + // Process chunks sequentially with context from previous chunk
363 + const processedChunks: string[] = [];
364 + let previousParagraphText: string | undefined;
365 + let previousParagraphNumber: string | null = null;
367 + for (let i = 0; i < chunks.length; i++) {
368 + const chunk = chunks[i];
369 + if (!chunk || chunk.length === 0) continue;
372 + const processedChunk = await processVTTChunk(
376 + previousParagraphNumber,
380 + previousParagraphText
382 + processedChunks.push(processedChunk);
383 + console.log(`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? ' (with context)' : ''}`);
385 + // Extract context for the next chunk
386 + if (i < chunks.length - 1) {
387 + const { segments: lastParagraphText, paragraphNumber, highestParagraphNumber } = extractLastParagraphAndHighestNumber(processedChunk);
389 + if (lastParagraphText) {
390 + console.log(`[VTTCleaner] Using paragraph ${paragraphNumber || 'unknown'} as context for next chunk (highest paragraph: ${highestParagraphNumber})`);
391 + previousParagraphText = lastParagraphText;
392 + previousParagraphNumber = highestParagraphNumber.toString();
394 + previousParagraphText = undefined;
395 + previousParagraphNumber = null;
399 + console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
400 + // Return the original segments for this chunk if processing fails
401 + const fallbackChunk = chunk.map(seg =>
402 + `${seg.index || ''}\n${seg.timestamp}\n${seg.text}`
404 + processedChunks.push(fallbackChunk);
405 + previousParagraphText = undefined;
406 + previousParagraphNumber = null;
410 + // Combine all processed chunks
411 + const finalVTT = `WEBVTT\n\n${processedChunks.join('\n\n')}`;
226 - `[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`,
414 + `[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
231 - console.error("[VTTCleaner] Exception:", err);
419 + console.error("[VTTCleaner] Exception:", error);
console.warn("[VTTCleaner] Falling back to uncleaned VTT");