commit a88b28130dcae1001ebdfef5c87a83c73279809e · dunkirk.sh/thistle

.env.example

···

       2
        
       # URL of the faster-whisper transcription server

     

       3
        
       # See README for setup instructions

     

       4
        
       WHISPER_SERVICE_URL=http://localhost:8000

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0

···

       2
        
       # URL of the faster-whisper transcription server

     

       3
        
       # See README for setup instructions

     

       4
        
       WHISPER_SERVICE_URL=http://localhost:8000

     

       5
       +
       

     

       6
       +
       # Gemini API Key (optional)

     

       7
       +
       # For cleaning transcripts - removes tags and improves grammar

     

       8
       +
       # Get your key from: https://aistudio.google.com/app/apikey

     

       9
       +
       # GEMINI_API_KEY=your_api_key_here

+53

src/lib/transcript-cleaner.test.ts

···

       1
       +
       import { test, expect } from "bun:test";

     

       2
       +
       import { cleanTranscript } from "./transcript-cleaner";

     

       3
       +
       

     

       4
       +
       // Live integration test: it sends the transcript to the real Gemini API.
       // Without GEMINI_API_KEY, cleanTranscript returns the raw transcript plus an
       // error, so every tag assertion below would fail for reasons unrelated to the
       // code under test. Skip instead of failing in that environment.
       test.skipIf(!process.env.GEMINI_API_KEY)(
       	"cleanTranscript removes tags and fixes grammar",
       	async () => {
       		const rawTranscript = `[SIDE CONVERSATION] Yes? So with this course packet, what quiz is and exams, and if I can study through here, what you talk about? And I give you a good review every time. Yeah, so I'd be good to just study that and then we can do it. Yeah, and all the examples and stuff that we get from class especially. And then I, like your first quiz, I give you a mock quiz exactly like the quiz. Oh, okay. so you can kind of get a feel for how I do things. [inaudible] Okay? [inaudible] Yeah. [background chatter]`;

       		const result = await cleanTranscript({
       			transcriptId: "test-123",
       			rawTranscript,
       		});

       		// A reported error means we got the raw-text fallback, not a cleaned result.
       		expect(result.error).toBeUndefined();

       		// Check that tags are removed
       		expect(result.cleanedTranscript).not.toContain("[SIDE CONVERSATION]");
       		expect(result.cleanedTranscript).not.toContain("[inaudible]");
       		expect(result.cleanedTranscript).not.toContain("[background chatter]");

       		// Check that we got some text back, and that removing tags shortened it
       		expect(result.cleanedTranscript.length).toBeGreaterThan(0);
       		expect(result.cleanedTranscript.length).toBeLessThan(rawTranscript.length);

       		console.log("Original:", rawTranscript.substring(0, 100));
       		console.log("Cleaned:", result.cleanedTranscript.substring(0, 100));
       	},
       	30000, // 30s timeout for API call
       );

     

       24
       +
       

     

       25
       +
       // An empty transcript must come back unchanged.
       test("cleanTranscript handles empty transcript", async () => {
       	const emptyInput = { transcriptId: "test-empty", rawTranscript: "" };

       	const { cleanedTranscript } = await cleanTranscript(emptyInput);

       	expect(cleanedTranscript).toBe("");
       });

     

       33
       +
       

     

       34
       +
       test("cleanTranscript falls back to raw transcript on API error", async () => {
       	const rawTranscript = "Test transcript";

       	// Force the "no API key" path regardless of the developer's environment.
       	const originalKey = process.env.GEMINI_API_KEY;
       	delete process.env.GEMINI_API_KEY;

       	try {
       		const result = await cleanTranscript({
       			transcriptId: "test-fallback",
       			rawTranscript,
       		});

       		expect(result.cleanedTranscript).toBe(rawTranscript);
       		expect(result.error).toBe("GEMINI_API_KEY not set");
       	} finally {
       		// Restore even when an assertion throws, so later tests in this process
       		// still see the key. Compare against undefined so an empty-string value
       		// is put back too.
       		if (originalKey !== undefined) {
       			process.env.GEMINI_API_KEY = originalKey;
       		}
       	}
       });

+127

src/lib/transcript-cleaner.ts

···

       1
       +
       // Clean up transcripts using Gemini to remove tags and fix grammar

     

       2
       +
       

     

       3
       +
       /** Input accepted by `cleanTranscript`. */
       interface CleanTranscriptOptions {
       	/** Identifier used to label this transcript in log output. */
       	transcriptId: string;
       	/** Transcript text to clean. */
       	rawTranscript: string;
       }

       /** Outcome of `cleanTranscript`. */
       interface CleanTranscriptResult {
       	/** Cleaned text, or the untouched input when cleaning was skipped or failed. */
       	cleanedTranscript: string;
       	/** Present only when the input was returned unchanged because cleaning failed. */
       	error?: string;
       }

       /** The fields of a Gemini `generateContent` response that this module reads. */
       interface GeminiGenerateContentResponse {
       	candidates?: Array<{
       		finishReason?: string;
       		content?: { parts?: Array<{ text?: unknown }> };
       	}>;
       }

       const GEMINI_GENERATE_URL =
       	"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent";

       /** Upper bound on one Gemini request so a stalled connection cannot hang the caller. */
       const GEMINI_REQUEST_TIMEOUT_MS = 120000;

       /** Build the instruction prompt that wraps the raw transcript. */
       function buildCleanupPrompt(rawTranscript: string): string {
       	return [
       		"You are a transcript editor. Clean up this transcript by:",
       		"1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.",
       		"2. Fixing grammar and punctuation to make sentences readable",
       		"3. Preserving the original sentence structure and wording as much as possible",
       		"4. Fixing obvious speech recognition errors (e.g., \"gr...\" should be \"grade\")",
       		"5. NOT adding any new content or changing the meaning",
       		"6. If there are obvious speaking mistakes then you can fix those (e.g. \"we are going no wait sorry you should be doing\")",
       		"",
       		"Return ONLY the cleaned transcript text, nothing else.",
       		"",
       		"Transcript to clean:",
       		rawTranscript,
       	].join("\n");
       }

       /**
        * Clean a transcript with Gemini (model `gemini-2.0-flash-exp`): remove tags
        * such as [SIDE CONVERSATION] / [inaudible] and fix grammar while keeping the
        * original wording.
        *
        * This function never throws. On any failure it returns the raw transcript
        * together with an `error` message, so callers can always use
        * `cleanedTranscript`. The failures handled are: missing GEMINI_API_KEY,
        * non-2xx response, network error or timeout, empty response, and a response
        * that did not finish normally (for example one truncated at the output-token
        * limit).
        *
        * @param options.transcriptId Identifier used only in log lines.
        * @param options.rawTranscript Text to clean. Empty or whitespace-only input is
        *   returned unchanged, without an error and without contacting the API.
        * @returns The cleaned text, or the raw text plus `error` when cleaning failed.
        */
       export async function cleanTranscript({
       	transcriptId,
       	rawTranscript,
       }: CleanTranscriptOptions): Promise<CleanTranscriptResult> {
       	// Nothing to clean is not a failure, so check this before requiring a key.
       	if (!rawTranscript || rawTranscript.trim().length === 0) {
       		return { cleanedTranscript: rawTranscript };
       	}

       	const fallback = (error: string): CleanTranscriptResult => ({
       		cleanedTranscript: rawTranscript,
       		error,
       	});

       	const apiKey = process.env.GEMINI_API_KEY;
       	if (!apiKey) {
       		return fallback("GEMINI_API_KEY not set");
       	}

       	console.log(
       		`[TranscriptCleaner] Starting cleanup for ${transcriptId} (${rawTranscript.length} chars)`,
       	);

       	// Abort the request (including reading the body) if it takes too long.
       	const controller = new AbortController();
       	const timer = setTimeout(
       		() => controller.abort(),
       		GEMINI_REQUEST_TIMEOUT_MS,
       	);

       	try {
       		const response = await fetch(GEMINI_GENERATE_URL, {
       			method: "POST",
       			headers: {
       				"Content-Type": "application/json",
       				"x-goog-api-key": apiKey,
       			},
       			body: JSON.stringify({
       				contents: [
       					{
       						parts: [{ text: buildCleanupPrompt(rawTranscript) }],
       					},
       				],
       				generationConfig: {
       					temperature: 0.3,
       					topK: 40,
       					topP: 0.95,
       					maxOutputTokens: 8192,
       				},
       			}),
       			signal: controller.signal,
       		});

       		if (!response.ok) {
       			const errorText = await response.text();
       			console.error(
       				`[TranscriptCleaner] Gemini API error for ${transcriptId}:`,
       				errorText,
       			);
       			return fallback(`Gemini API error: ${response.status}`);
       		}

       		const result = (await response.json()) as GeminiGenerateContentResponse;
       		const candidate = result.candidates?.[0];

       		// The answer may be split over several parts; concatenate all of them.
       		const cleanedText = (candidate?.content?.parts ?? [])
       			.map((part) => (typeof part.text === "string" ? part.text : ""))
       			.join("")
       			.trim();

       		if (!cleanedText) {
       			console.warn(
       				`[TranscriptCleaner] Empty response from Gemini for ${transcriptId}`,
       			);
       			return fallback("Empty response from Gemini");
       		}

       		// Any finish reason other than STOP (e.g. MAX_TOKENS) means the text is
       		// incomplete. Using it would silently drop the tail of the transcript.
       		const finishReason = candidate?.finishReason;
       		if (finishReason !== undefined && finishReason !== "STOP") {
       			console.warn(
       				`[TranscriptCleaner] Incomplete response from Gemini for ${transcriptId}: ${finishReason}`,
       			);
       			return fallback(`Gemini response incomplete: ${finishReason}`);
       		}

       		const reduction = Math.round(
       			((rawTranscript.length - cleanedText.length) / rawTranscript.length) *
       				100,
       		);
       		console.log(
       			`[TranscriptCleaner] Completed for ${transcriptId}: ${rawTranscript.length} → ${cleanedText.length} chars (${reduction}% reduction)`,
       		);

       		return { cleanedTranscript: cleanedText };
       	} catch (error) {
       		console.error(
       			`[TranscriptCleaner] Failed to clean ${transcriptId}:`,
       			error,
       		);
       		return fallback(error instanceof Error ? error.message : "Unknown error");
       	} finally {
       		clearTimeout(timer);
       	}
       }

+6 -25

src/lib/transcription.ts

···

       1
        
       import type { Database } from "bun:sqlite";

     

       2
        
       import { createEventSource } from "eventsource-client";

     

       3
        
       import { ErrorCode } from "./errors";

     

       4
       -
       import { saveTranscript, saveTranscriptVTT } from "./transcript-storage";

     

       0
        
       
     

       5
        
       

     

       6
        
       // Constants

     

       7
        
       export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB

     
···

       275
        
       			let transcript = update.transcript ?? "";

     

       276
        
       			transcript = transcript.replace(/<\|[^|]+\|>/g, "").trim();

     

       277
        
       

     

       278
       -
       			// Save transcript to file (overwrites on each update)

     

       279
       -
       			if (transcript) {

     

       280
       -
       				await saveTranscript(transcriptionId, transcript);

     

       281
       -
       			}

     

       282
       -
       

     

       283
        
       			this.updateTranscription(transcriptionId, {

     

       284
        
       				status,

     

       285
        
       				progress,

     
···

       291
        
       				transcript: transcript || undefined,

     

       292
        
       			});

     

       293
        
       		} else if (update.status === "completed") {

     

       294
       -
       			// Final transcript should already have tokens stripped by Murmur

     

       295
       -
       			const transcript = update.transcript ?? "";

     

       296
       -
       

     

       297
       -
       			// Save final transcript to file

     

       298
       -
       			if (transcript) {

     

       299
       -
       				await saveTranscript(transcriptionId, transcript);

     

       300
       -
       			}

     

       301
       -
       

     

       302
        
       			// Fetch and save VTT file from Murmur

     

       303
        
       			const whisperJobId = this.db

     

       304
        
       				.query<{ whisper_job_id: string }, [string]>(

     
···

       313
        
       					);

     

       314
        
       					if (vttResponse.ok) {

     

       315
        
       						const vttContent = await vttResponse.text();

     

       316
       -
       						await saveTranscriptVTT(transcriptionId, vttContent);

     

       0
        
       
     

       317
        
       					}

     

       318
        
       				} catch (error) {

     

       319
        
       					console.warn(

     
···

       331
        
       			this.events.emit(transcriptionId, {

     

       332
        
       				status: "completed",

     

       333
        
       				progress: 100,

     

       334
       -
       				transcript,

     

       335
        
       			});

     

       336
        
       

     

       337
        
       			// Close stream - keep audio file for playback

     
···

       525
        
       			if (!details) return;

     

       526
        
       

     

       527
        
       			if (details.status === "completed") {

     

       528
       -
       				const transcript = details.transcript ?? "";

     

       529
       -
       

     

       530
       -
       				// Save transcript to file

     

       531
       -
       				if (transcript) {

     

       532
       -
       					await saveTranscript(transcriptionId, transcript);

     

       533
       -
       				}

     

       534
       -
       

     

       535
        
       				// Fetch and save VTT file

     

       536
        
       				try {

     

       537
        
       					const vttResponse = await fetch(

     
···

       539
        
       					);

     

       540
        
       					if (vttResponse.ok) {

     

       541
        
       						const vttContent = await vttResponse.text();

     

       542
       -
       						await saveTranscriptVTT(transcriptionId, vttContent);

     

       0
        
       
     

       543
        
       					}

     

       544
        
       				} catch (error) {

     

       545
        
       					console.warn(

     
···

       556
        
       				this.events.emit(transcriptionId, {

     

       557
        
       					status: "completed",

     

       558
        
       					progress: 100,

     

       559
       -
       					transcript,

     

       560
        
       				});

     

       561
        
       			} else if (details.status === "failed") {

     

       562
        
       				const errorMessage = (

···

       1
        
       import type { Database } from "bun:sqlite";

     

       2
        
       import { createEventSource } from "eventsource-client";

     

       3
        
       import { ErrorCode } from "./errors";

     

       4
       +
       import { saveTranscriptVTT } from "./transcript-storage";

     

       5
       +
       import { cleanVTT } from "./vtt-cleaner";

     

       6
        
       

     

       7
        
       // Constants

     

       8
        
       export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB

     
···

       276
        
       			let transcript = update.transcript ?? "";

     

       277
        
       			transcript = transcript.replace(/<\|[^|]+\|>/g, "").trim();

     

       278
        
       

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       279
        
       			this.updateTranscription(transcriptionId, {

     

       280
        
       				status,

     

       281
        
       				progress,

     
···

       287
        
       				transcript: transcript || undefined,

     

       288
        
       			});

     

       289
        
       		} else if (update.status === "completed") {

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       290
        
       			// Fetch and save VTT file from Murmur

     

       291
        
       			const whisperJobId = this.db

     

       292
        
       				.query<{ whisper_job_id: string }, [string]>(

     
···

       301
        
       					);

     

       302
        
       					if (vttResponse.ok) {

     

       303
        
       						const vttContent = await vttResponse.text();

     

       304
       +
       						const cleanedVTT = await cleanVTT(transcriptionId, vttContent);

     

       305
       +
       						await saveTranscriptVTT(transcriptionId, cleanedVTT);

     

       306
        
       					}

     

       307
        
       				} catch (error) {

     

       308
        
       					console.warn(

     
···

       320
        
       			this.events.emit(transcriptionId, {

     

       321
        
       				status: "completed",

     

       322
        
       				progress: 100,

     

       0
        
       
     

       323
        
       			});

     

       324
        
       

     

       325
        
       			// Close stream - keep audio file for playback

     
···

       513
        
       			if (!details) return;

     

       514
        
       

     

       515
        
       			if (details.status === "completed") {

     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       0
        
       
     

       516
        
       				// Fetch and save VTT file

     

       517
        
       				try {

     

       518
        
       					const vttResponse = await fetch(

     
···

       520
        
       					);

     

       521
        
       					if (vttResponse.ok) {

     

       522
        
       						const vttContent = await vttResponse.text();

     

       523
       +
       						const cleanedVTT = await cleanVTT(transcriptionId, vttContent);

     

       524
       +
       						await saveTranscriptVTT(transcriptionId, cleanedVTT);

     

       525
        
       					}

     

       526
        
       				} catch (error) {

     

       527
        
       					console.warn(

     
···

       538
        
       				this.events.emit(transcriptionId, {

     

       539
        
       					status: "completed",

     

       540
        
       					progress: 100,

     

       0
        
       
     

       541
        
       				});

     

       542
        
       			} else if (details.status === "failed") {

     

       543
        
       				const errorMessage = (

+38

src/lib/vtt-cleaner.test.ts

···

       1
       +
       import { test, expect } from "bun:test";

     

       2
       +
       import { cleanVTT } from "./vtt-cleaner";

     

       3
       +
       

     

       4
       +
       const sampleVTT = `WEBVTT

     

       5
       +
       

     

       6
       +
       00:00:00.000 --> 00:00:03.480

     

       7
       +
       <|startoftranscript|> [SIDE CONVERSATION]<|endoftext|>

     

       8
       +
       

     

       9
       +
       00:00:00.000 --> 00:00:00.000

     

       10
       +
       <|startoftranscript|> Yes?

     

       11
       +
       

     

       12
       +
       00:00:00.000 --> 00:00:00.000

     

       13
       +
       So with this course packet, what quiz is and exams, and if I can study through here, what you talk about?

     

       14
       +
       

     

       15
       +
       00:00:00.000 --> 00:00:00.000

     

       16
       +
       And I give you a good review every time.

     

       17
       +
       

     

       18
       +
       00:00:00.000 --> 00:00:00.000

     

       19
       +
       Yeah, so I'd be good to just study that and then we can do it.`;

     

       20
       +
       

     

       21
       +
       // Live integration test: cleanVTT sends the cue text to the real Gemini API.
       // Without GEMINI_API_KEY it falls back to the original VTT, which still
       // contains the tags, so the assertions below cannot pass. Skip in that case.
       test.skipIf(!process.env.GEMINI_API_KEY)(
       	"cleanVTT removes tags and cleans text",
       	async () => {
       		const result = await cleanVTT("test-vtt", sampleVTT);

       		// Structure is preserved...
       		expect(result).toContain("WEBVTT");
       		expect(result).toContain("-->");

       		// ...while bracketed tags and Whisper special tokens are gone.
       		expect(result).not.toContain("[SIDE CONVERSATION]");
       		expect(result).not.toContain("<|startoftranscript|>");
       		expect(result).not.toContain("<|endoftext|>");

       		console.log("Cleaned VTT preview:", result.substring(0, 200));
       	},
       	30000, // allow time for the API round trip
       );

     

       32
       +
       

     

       33
       +
       // A VTT document with a header but no cues must be returned unchanged.
       test("cleanVTT preserves empty VTT", async () => {
       	const emptyVTT = "WEBVTT\n\n";

       	expect(await cleanVTT("test-empty", emptyVTT)).toBe(emptyVTT);
       });

+125

src/lib/vtt-cleaner.ts

···

       1
       +
       // Parse and clean VTT files

     

       2
       +
       

     

       3
       +
       import { cleanTranscript } from "./transcript-cleaner";

     

       4
       +
       

     

       5
       +
       /** One cue parsed out of a WebVTT document. */
       interface VTTSegment {
       	/** Numeric cue identifier, when one sits directly above the timing line. */
       	index?: number;
       	/** The trimmed timing line, e.g. `00:00:00.000 --> 00:00:03.480`. */
       	timestamp: string;
       	/** Cue payload; multi-line payloads are joined with `\n` and trimmed. */
       	text: string;
       }

       /**
        * Parse VTT content into segments.
        *
        * A cue starts at any line containing `-->`; the lines that follow, up to the
        * next blank line or the next timing line, form its text. A digits-only line
        * directly above a timing line is recorded as the cue's `index`. Everything
        * else (the `WEBVTT` header, notes, stray text) is ignored. Cues whose text is
        * empty are dropped.
        *
        * Accepts LF or CRLF line endings and an optional leading byte-order mark.
        *
        * @param vttContent Raw WebVTT document.
        * @returns Cues in document order; empty when the document has no cue text.
        */
       function parseVTT(vttContent: string): VTTSegment[] {
       	// Drop a leading BOM and split on either line-ending style so no "\r"
       	// ends up inside multi-line cue text.
       	const lines = vttContent.replace(/^\uFEFF/, "").split(/\r?\n/);
       	const segments: VTTSegment[] = [];
       	let pendingIndex: number | undefined;
       	let cursor = 0;

       	while (cursor < lines.length) {
       		const line = (lines[cursor] ?? "").trim();
       		cursor++;

       		if (line.includes("-->")) {
       			// Collect payload lines until a blank line or the next timing line.
       			const payload: string[] = [];
       			while (cursor < lines.length) {
       				const candidate = lines[cursor] ?? "";
       				if (candidate.trim() === "" || candidate.includes("-->")) {
       					break;
       				}
       				payload.push(candidate);
       				cursor++;
       			}

       			// Emit the cue as soon as it ends, so a following cue that is not
       			// separated by a blank line cannot overwrite it.
       			const text = payload.join("\n").trim();
       			if (text) {
       				const segment: VTTSegment = { timestamp: line, text };
       				if (pendingIndex !== undefined) {
       					segment.index = pendingIndex;
       				}
       				segments.push(segment);
       			}
       			pendingIndex = undefined;
       		} else if (/^\d+$/.test(line)) {
       			// Possible cue identifier; only kept if a timing line comes next.
       			pendingIndex = Number.parseInt(line, 10);
       		} else {
       			// Blank line, header or other text: a previously seen number was not
       			// a cue identifier, so do not let it leak onto a later cue.
       			pendingIndex = undefined;
       		}
       	}

       	return segments;
       }

     

       59
       +
       

     

       60
       +
       /**
        * Clean the cue text of a VTT document by removing tags and fixing grammar.
        *
        * The text of all cues is joined, cleaned in a single `cleanTranscript` call,
        * and the cleaned words are then spread back over the original cue timings in
        * proportion to each cue's original word count. Alignment between words and
        * timings is therefore approximate.
        *
        * Whisper special tokens (`<|...|>`) are stripped from the cue text before it
        * is sent for cleaning. Cues that end up with no words are omitted from the
        * output.
        *
        * @param transcriptionId Identifier used in log lines and passed to the cleaner.
        * @param vttContent Raw WebVTT document.
        * @returns The rebuilt VTT document. The original `vttContent` is returned
        *   unchanged when it has no cue text or when cleaning reports an error.
        */
       export async function cleanVTT(
       	transcriptionId: string,
       	vttContent: string,
       ): Promise<string> {
       	const segments = parseVTT(vttContent);

       	if (segments.length === 0) {
       		return vttContent;
       	}

       	console.log(
       		`[VTTCleaner] Cleaning ${segments.length} segments for ${transcriptionId}`,
       	);

       	// Strip Whisper special tokens deterministically rather than relying on
       	// the model to remove them.
       	const cueTexts = segments.map((cue) =>
       		cue.text.replace(/<\|[^|]+\|>/g, "").trim(),
       	);
       	const cueWordCounts = cueTexts.map((text) =>
       		text === "" ? 0 : text.split(/\s+/).length,
       	);
       	const totalOriginalWords = cueWordCounts.reduce(
       		(sum, count) => sum + count,
       		0,
       	);

       	if (totalOriginalWords === 0) {
       		return vttContent;
       	}

       	// Combine all text for cleaning
       	const allText = cueTexts.filter((text) => text !== "").join(" ");

       	const { cleanedTranscript, error } = await cleanTranscript({
       		transcriptId: transcriptionId,
       		rawTranscript: allText,
       	});

       	if (error) {
       		console.warn(`[VTTCleaner] Falling back to original VTT: ${error}`);
       		return vttContent;
       	}

       	const words = cleanedTranscript.split(/\s+/).filter((word) => word !== "");
       	if (words.length === 0) {
       		return vttContent;
       	}

       	// Split the cleaned words back over the cues. Each cue ends at the rounded
       	// cumulative share of the original words, so the boundaries never drift:
       	// the last cue with text always ends exactly at words.length and no
       	// cleaned word is dropped.
       	let output = "WEBVTT\n\n";
       	let originalWordsSeen = 0;
       	let sliceStart = 0;

       	segments.forEach((cue, position) => {
       		originalWordsSeen += cueWordCounts[position] ?? 0;
       		const sliceEnd = Math.min(
       			words.length,
       			Math.round((originalWordsSeen / totalOriginalWords) * words.length),
       		);
       		const cueWords = words.slice(sliceStart, sliceEnd);
       		sliceStart = sliceEnd;

       		// A cue with no words left (e.g. it only held a removed tag) is omitted.
       		if (cueWords.length === 0) {
       			return;
       		}

       		if (cue.index !== undefined) {
       			output += `${cue.index}\n`;
       		}
       		output += `${cue.timestamp}\n`;
       		output += `${cueWords.join(" ")}\n\n`;
       	});

       	console.log(`[VTTCleaner] Completed for ${transcriptionId}`);

       	return output;
       }