commit 172063591fca7ab97fffa48accab3f53ab799cc7 · dunkirk.sh/thistle

+240 -52
src/lib/vtt-cleaner.ts
···

       103
       103
        
       }

     

       104
       104
        
       

     

       105
       105
        
       /**

     

       106
       106
       -
        * Clean VTT text using AI to create paragraph-separated VTT file.

     

       107
       107
       -
        * Uses OpenRouter API to intelligently group segments into paragraphs

     

       108
       108
       -
        * while preserving timing information.

     

       106
       106
       +
        * Chunk size for VTT processing

     

       109
       107
        
        */

     

       110
       110
       -
       export async function cleanVTT(

     

       111
       111
       -
       	transcriptionId: string,

     

       112
       112
       -
       	vttContent: string,

     

       113
       113
       -
       ): Promise<string> {

     

       114
       114
       -
       	const segments = parseVTT(vttContent);

     

       108
       108
       +
       const CHUNK_SIZE = 40; // Maximum number of parsed segments placed in each chunk (one processVTTChunk call per chunk)

     

       115
       109
        
       

     

       116
       116
       -
       	if (segments.length === 0) {

     

       117
       117
       -
       		return vttContent;

     

       110
       110
       +
       /**
        * Find the final paragraph in a piece of processed VTT text and report the
        * largest paragraph number that appears anywhere in it.
        *
        * Cues are separated by one or more blank lines (LF or CRLF, whitespace-only
        * lines count as blank). A cue belongs to a paragraph when its first line
        * contains a "Paragraph X-Y" label; X is the paragraph number.
        *
        * @param vttContent - Processed VTT cue text to scan.
        * @returns `segments`: the cues of the last labelled paragraph joined with a
        *   blank line ('' when no label is found); `paragraphNumber`: that
        *   paragraph's number as it appears in the label, or null;
        *   `highestParagraphNumber`: the largest X seen in any label, or 0.
        */
       function extractLastParagraphAndHighestNumber(vttContent: string): {
       	segments: string,
       	paragraphNumber: string | null,
       	highestParagraphNumber: number
       } {
       	const nothingFound = { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };
       	if (!vttContent) return nothingFound;

       	const paragraphLabel = /Paragraph (\d+)-\d+/;

       	// Split on any run of blank lines so CRLF endings, extra blank lines and
       	// whitespace-only separator lines never leave a cue with an empty first line.
       	const cues = vttContent
       		.trim()
       		.split(/\r?\n(?:[ \t]*\r?\n)+/)
       		.filter((cue) => cue.trim() !== '');
       	if (cues.length === 0) return nothingFound;

       	// Paragraph number of each cue (null when its first line carries no label).
       	const labels = cues.map((cue) => {
       		const firstLine = cue.split(/\r?\n/, 1)[0] ?? '';
       		return paragraphLabel.exec(firstLine)?.[1] ?? null;
       	});

       	let highestParagraphNumber = 0;
       	for (const label of labels) {
       		if (label === null) continue;
       		const value = parseInt(label, 10);
       		if (!Number.isNaN(value) && value > highestParagraphNumber) {
       			highestParagraphNumber = value;
       		}
       	}

       	// Walk backwards from the end, collecting cues until a different
       	// paragraph number shows up.
       	const lastParagraphCues: string[] = [];
       	let paragraphNumber: string | null = null;
       	for (let i = cues.length - 1; i >= 0; i--) {
       		const cue = cues[i];
       		if (cue === undefined) continue;
       		const label = labels[i] ?? null;

       		if (label === null) {
       			// Unlabelled cue: keep it only once collection has started, since it
       			// may belong to the paragraph being gathered.
       			if (paragraphNumber !== null) lastParagraphCues.unshift(cue);
       			continue;
       		}

       		if (paragraphNumber === null) {
       			paragraphNumber = label;
       		} else if (label !== paragraphNumber) {
       			break;
       		}
       		lastParagraphCues.unshift(cue);
       	}

       	return {
       		segments: lastParagraphCues.join('\n\n'),
       		paragraphNumber,
       		highestParagraphNumber,
       	};
       }

     

       132
       186
        
       

     

       133
       133
       -
       	try {

     

       134
       134
       -
       		// Build the input for the AI

     

       135
       135
       -
       		const inputSegments = segments.map((seg, idx) => ({

     

       136
       136
       -
       			index: idx,

     

       137
       137
       -
       			timestamp: seg.timestamp,

     

       138
       138
       -
       			text: seg.text,

     

       139
       139
       -
       		}));

     

       140
       140
       -
       

     

       141
       141
       -
       		const prompt = `Can you turn this into a paragraph separated vtt file?

     

       187
       187
       +
       /**

     

       188
       188
       +
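        * Process one chunk of VTT segments with the LLM and return the cue text for
        * that chunk with any leading "WEBVTT" header removed (it is added once at the end).
        * Throws when the API response is not OK or the model returns empty content.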

     

       189
       189
       +
        */

     

       190
       190
       +
       async function processVTTChunk(

     

       191
       191
       +
       	transcriptionId: string,

     

       192
       192
       +
       	inputSegments: Array<{index: number, timestamp: string, text: string}>,

     

       193
       193
       +
       	chunkIndex: number,

     

       194
       194
       +
       	previousParagraphNumber: string | null,

     

       195
       195
       +
       	apiKey: string,

     

       196
       196
       +
       	apiBaseUrl: string,

     

       197
       197
       +
       	model: string,

     

       198
       198
       +
       	previousParagraphText?: string,

     

       199
       199
       +
       ): Promise<string> {

     

       200
       200
       +
       	const chunkId = `${transcriptionId}-chunk${chunkIndex}`;

     

       201
       201
       +
       	

     

       202
       202
       +
       	const hasTextContext = !!previousParagraphText;

     

       203
       203
       +
       	

     

       204
       204
       +
       	console.log(`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? ' and previous paragraph text context' : ''}`);

     

       205
       205
       +
       	

     

       206
       206
       +
       	const nextParagraphNumber = previousParagraphNumber ? String(parseInt(previousParagraphNumber, 10) + 1) : '1';

     

       207
       207
       +
       	

     

       208
       208
       +
       	const prompt = `Can you turn this into a paragraph separated vtt file?

     

       142
       209
        
       

     

       143
       210
        
       Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:

     

       144
       211
        
       

     
···

       160
       227
        
       

     

       161
       228
        
       I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.

     

       162
       229
        
       

     

       230
       230
       +
       Here are important guidelines for forming paragraphs:

     

       231
       231
       +
       1. Create a new paragraph when there's a change in topic or speaker.

     

       232
       232
       +
       2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.

     

       233
       233
       +
       3. Group related thoughts together in the same paragraph.

     

       234
       234
       +
       4. Start a new paragraph when a sentence introduces a completely new idea.

     

       235
       235
       +
       5. Focus on the number of sentences, not segments, when creating paragraphs.

     

       236
       236
       +
       6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.

     

       237
       237
       +
       

     

       163
       238
        
       Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.

     

       164
       239
        
       

     

       165
       165
       -
       Input segments:

     

       240
       240
       +
       ${hasTextContext ? 

     

       241
       241
       +
       `The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:

     

       242
       242
       +
       

     

       243
       243
       +
       ${previousParagraphText}

     

       244
       244
       +
       

     

       245
       245
       +
       Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ''}` 

     

       246
       246
       +
       : 'Process the following segments:'}

     

       247
       247
       +
       

     

       166
       248
        
       ${JSON.stringify(inputSegments, null, 2)}

     

       167
       249
        
       

     

       168
       168
       -
       Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;

     

       250
       250
       +
       Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;

     

       169
       251
        
       

     

       252
       252
       +
       	try {

     

       170
       253
        
       		const response = await fetch(

     

       171
       254
        
       			`${apiBaseUrl}/chat/completions`,

     

       172
       255
        
       			{

     
···

       175
       258
        
       					"Content-Type": "application/json",

     

       176
       259
        
       					"Authorization": `Bearer ${apiKey}`,

     

       177
       260
        
       					"HTTP-Referer": "https://thistle.app",

     

       178
       178
       -
       					"X-Title": "Thistle Transcription",

     

       261
       261
       +
       					"X-Title": `Thistle Transcription Chunk ${chunkIndex}`,

     

       179
       262
        
       				},

     

       180
       263
        
       				body: JSON.stringify({

     

       181
       264
        
       					model,

     
···

       183
       266
        
       						{ role: "user", content: prompt },

     

       184
       267
        
       					],

     

       185
       268
        
       					temperature: 0.3,

     

       186
       186
       -
       					max_tokens: 16384,

     

       269
       269
       +
       					max_tokens: 8192, // Reduced for chunks

     

       187
       270
        
       				}),

     

       188
       271
        
       			},

     

       189
       272
        
       		);

     

       190
       273
        
       

     

       191
       274
        
       		if (!response.ok) {

     

       192
       275
        
       			const errorText = await response.text();

     

       193
       193
       -
       			console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);

     

       194
       194
       -
       			console.warn("[VTTCleaner] Falling back to uncleaned VTT");

     

       195
       195
       -
       			return vttContent;

     

       276
       276
       +
       			console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);

     

       277
       277
       +
       			throw new Error(`API error: ${response.status}`);

     

       196
       278
        
       		}

     

       197
       279
        
       

     

       198
       280
        
       		const result = await response.json();

     

       199
       281
        
       		const cleanedVTT = result.choices?.[0]?.message?.content?.trim();

     

       200
       282
        
       

     

       201
       283
        
       		if (!cleanedVTT) {

     

       202
       202
       -
       			console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");

     

       203
       203
       -
       			return vttContent;

     

       284
       284
       +
       			throw new Error("Empty response from AI");

     

       204
       285
        
       		}

     

       205
       286
        
       

     

       206
       287
        
       		// Extract VTT content if the model wrapped it in markdown

     

       207
       207
       -
       		let finalVTT = cleanedVTT;

     

       288
       288
       +
       		let chunkVTT = cleanedVTT;

     

       208
       289
        
       		if (cleanedVTT.includes("```")) {

     

       209
       290
        
       			const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);

     

       210
       291
        
       			if (vttMatch?.[1]) {

     

       211
       211
       -
       				finalVTT = vttMatch[1].trim();

     

       292
       292
       +
       				chunkVTT = vttMatch[1].trim();

     

       212
       293
        
       			}

     

       213
       294
        
       		}

     

       214
       295
        
       

     

       215
       215
       -
       		// Ensure it starts with WEBVTT

     

       216
       216
       -
       		if (!finalVTT.startsWith("WEBVTT")) {

     

       217
       217
       -
       			const webvttIndex = finalVTT.indexOf("WEBVTT");

     

       218
       218
       -
       			if (webvttIndex !== -1) {

     

       219
       219
       -
       				finalVTT = finalVTT.substring(webvttIndex);

     

       220
       220
       -
       			} else {

     

       221
       221
       -
       				finalVTT = `WEBVTT\n\n${finalVTT}`;

     

       296
       296
       +
       		// Remove WEBVTT header if present (we'll add it once at the end)

     

       297
       297
       +
       		if (chunkVTT.startsWith("WEBVTT")) {

     

       298
       298
       +
       			const lines = chunkVTT.split("\n");

     

       299
       299
       +
       			// Skip WEBVTT line and any blank lines that follow

     

       300
       300
       +
       			let i = 1;

     

       301
       301
       +
       			while (i < lines.length && !lines[i]?.trim()) {

     

       302
       302
       +
       				i++;

     

       222
       303
        
       			}

     

       304
       304
       +
       			chunkVTT = lines.slice(i).join("\n");

     

       223
       305
        
       		}

     

       224
       306
        
       

     

       307
       307
       +
       		console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);

     

       308
       308
       +
       		return chunkVTT;

     

       309
       309
       +
       	} catch (error) {

     

       310
       310
       +
       		console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);

     

       311
       311
       +
       		throw error;

     

       312
       312
       +
       	}

     

       313
       313
       +
       }

     

       314
       314
       +
       

     

       315
       315
       +
       /**
        * Clean VTT text using AI to create a paragraph-separated VTT file.
        *
        * The parsed segments are split into chunks of at most CHUNK_SIZE and
        * processed one after another. After each chunk, its last paragraph and
        * highest paragraph number are passed to the next chunk as context so
        * paragraphs can continue across chunk boundaries.
        *
        * Fallback behaviour:
        * - No parsed segments, or LLM_API_KEY / LLM_API_BASE_URL / LLM_MODEL not
        *   all set: the input is returned unchanged.
        * - A chunk that fails or yields no content is replaced by its original
        *   segments (index, timestamp, text) and the context is reset.
        * - If every chunk fails, or an unexpected error is thrown while chunking
        *   or combining, the input is returned unchanged.
        *
        * @param transcriptionId - Identifier used in log messages and chunk ids.
        * @param vttContent - The raw VTT text to clean.
        * @returns The cleaned VTT text starting with "WEBVTT", or `vttContent`
        *   unchanged when cleaning is not possible.
        */
       export async function cleanVTT(
       	transcriptionId: string,
       	vttContent: string,
       ): Promise<string> {
       	const segments = parseVTT(vttContent);

       	if (segments.length === 0) {
       		return vttContent;
       	}

       	console.log(
       		`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
       	);

       	const apiKey = process.env.LLM_API_KEY;
       	const apiBaseUrl = process.env.LLM_API_BASE_URL;
       	const model = process.env.LLM_MODEL;

       	if (!apiKey || !apiBaseUrl || !model) {
       		console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
       		return vttContent;
       	}

       	try {
       		// Build the input segments
       		const inputSegments = segments.map((seg, idx) => ({
       			index: idx,
       			timestamp: seg.timestamp,
       			text: seg.text,
       		}));

       		// Prepare chunks for sequential processing (slice clamps at the array end)
       		const chunks: Array<typeof inputSegments> = [];
       		for (let start = 0; start < inputSegments.length; start += CHUNK_SIZE) {
       			chunks.push(inputSegments.slice(start, start + CHUNK_SIZE));
       		}

       		console.log(`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`);

       		// Process chunks sequentially with context from previous chunk
       		const processedChunks: string[] = [];
       		let previousParagraphText: string | undefined;
       		let previousParagraphNumber: string | null = null;
       		let failedChunks = 0;

       		for (let i = 0; i < chunks.length; i++) {
       			const chunk = chunks[i];
       			if (!chunk || chunk.length === 0) continue;

       			try {
       				const processedChunk = await processVTTChunk(
       					transcriptionId,
       					chunk,
       					i,
       					previousParagraphNumber,
       					apiKey,
       					apiBaseUrl,
       					model,
       					previousParagraphText
       				);

       				// A chunk that comes back blank (e.g. only a header) would silently
       				// drop its segments; route it through the fallback path instead.
       				if (!processedChunk.trim()) {
       					throw new Error("Processed chunk is empty");
       				}

       				processedChunks.push(processedChunk);
       				console.log(`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? ' (with context)' : ''}`);

       				// Extract context for the next chunk
       				if (i < chunks.length - 1) {
       					const { segments: lastParagraphText, paragraphNumber, highestParagraphNumber } = extractLastParagraphAndHighestNumber(processedChunk);

       					if (lastParagraphText) {
       						console.log(`[VTTCleaner] Using paragraph ${paragraphNumber || 'unknown'} as context for next chunk (highest paragraph: ${highestParagraphNumber})`);
       						previousParagraphText = lastParagraphText;
       						previousParagraphNumber = highestParagraphNumber.toString();
       					} else {
       						previousParagraphText = undefined;
       						previousParagraphNumber = null;
       					}
       				}
       			} catch (error) {
       				failedChunks++;
       				console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
       				// Return the original segments for this chunk if processing fails.
       				// The index is written as-is so that segment 0 keeps its identifier.
       				const fallbackChunk = chunk.map(seg =>
       					`${seg.index}\n${seg.timestamp}\n${seg.text}`
       				).join('\n\n');
       				processedChunks.push(fallbackChunk);
       				previousParagraphText = undefined;
       				previousParagraphNumber = null;
       			}
       		}

       		// Nothing was cleaned at all: hand back the caller's VTT untouched
       		// rather than a rebuilt copy reported as a success.
       		if (failedChunks === chunks.length) {
       			console.warn("[VTTCleaner] All chunks failed, falling back to uncleaned VTT");
       			return vttContent;
       		}
       		if (failedChunks > 0) {
       			console.warn(`[VTTCleaner] ${failedChunks} of ${chunks.length} chunks fell back to their original segments`);
       		}

       		// Combine all processed chunks
       		const finalVTT = `WEBVTT\n\n${processedChunks.join('\n\n')}`;

       		console.log(
       			`[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
       		);

       		return finalVTT;
       	} catch (error) {
       		console.error("[VTTCleaner] Exception:", error);
       		console.warn("[VTTCleaner] Falling back to uncleaned VTT");
       		return vttContent;
       	}
       }