🪻 distributed transcription service thistle.dunkirk.sh

feat: use ai to break up paragraphs and rephrase stuff

dunkirk.sh ed81b190 2c25d3a2

verified
-1
src/lib/transcription.ts
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
-import { parseVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
+53 -52
src/lib/vtt-cleaner.test.ts
···
import { test, expect } from "bun:test";
-import { cleanVTT } from "./vtt-cleaner";
-import { readFileSync } from "fs";
-import { join } from "path";
+import { cleanVTT, parseVTT } from "./vtt-cleaner";
const sampleVTT = `WEBVTT
00:00:00.000 --> 00:00:03.480
<|startoftranscript|> [SIDE CONVERSATION]<|endoftext|>
-00:00:00.000 --> 00:00:00.000
+00:00:03.480 --> 00:00:05.000
<|startoftranscript|> Yes?
-00:00:00.000 --> 00:00:00.000
+00:00:05.000 --> 00:00:08.000
So with this course packet, what quiz is and exams, and if I can study through here, what you talk about?
-00:00:00.000 --> 00:00:00.000
+00:00:08.000 --> 00:00:10.000
And I give you a good review every time.
-00:00:00.000 --> 00:00:00.000
+00:00:10.000 --> 00:00:12.000
Yeah, so I'd be good to just study that and then we can do it.`;
-test("cleanVTT removes tags and cleans text", async () => {
+test("parseVTT extracts segments correctly", () => {
+const segments = parseVTT(sampleVTT);
+
+expect(segments.length).toBeGreaterThan(0);
+expect(segments[0]?.timestamp).toContain("-->");
+expect(segments[0]?.text).toBeDefined();
+expect(segments[0]?.start).toBeGreaterThanOrEqual(0);
+expect(segments[0]?.end).toBeGreaterThanOrEqual(0);
+});
+
+test("parseVTT handles empty VTT", () => {
+const emptyVTT = "WEBVTT\n\n";
+const segments = parseVTT(emptyVTT);
+
+expect(segments.length).toBe(0);
+});
+
+test("cleanVTT preserves VTT format when AI key not available", async () => {
+// Save original env var
+const originalKey = process.env.LLM_API_KEY;
+
+// Remove key to test fallback
+delete process.env.LLM_API_KEY;
+
const result = await cleanVTT("test-vtt", sampleVTT);
expect(result).toContain("WEBVTT");
-expect(result).not.toContain("[SIDE CONVERSATION]");
-expect(result).not.toContain("<|startoftranscript|>");
-expect(result).not.toContain("<|endoftext|>");
expect(result).toContain("-->");
-
-console.log("Cleaned VTT preview:", result.substring(0, 200));
-}, 30000);
+
+// Restore original key
+if (originalKey) {
+process.env.LLM_API_KEY = originalKey;
+}
+});
test("cleanVTT preserves empty VTT", async () => {
const emptyVTT = "WEBVTT\n\n";
···
expect(result).toBe(emptyVTT);
});
-test("cleanVTT detects multiple paragraphs", async () => {
-const multiParaVTT = `WEBVTT
-
-Paragraph 1-1
-00:00:00.000 --> 00:00:00.000
-Again, thank you for the privilege to not only study here, but also to teach here. Jesus,
-
-Paragraph 1-2
-00:00:00.000 --> 00:00:00.000
-thank you. All`;
-
-const result = await cleanVTT("test-multi-para", multiParaVTT);
-
-expect(result).toContain("Paragraph 1-1");
-expect(result).toContain("Paragraph 2-1");
-// Should have at least two paragraphs
-const paraMatches = result.match(/Paragraph \d+-\d+/g);
-expect(paraMatches?.length).toBeGreaterThan(1);
-}, 30000);
-
-test("cleanVTT with real transcription data", async () => {
-const originalApiKey = process.env.OPENROUTER_API_KEY;
-// Temporarily unset to force fallback
-delete process.env.OPENROUTER_API_KEY;
-
-try {
-const vttPath = join(__dirname, "../../transcripts/d69d8076-598a-4fe5-8100-fe3eff47fcd6.vtt");
-const realVTT = readFileSync(vttPath, "utf-8");
+// Integration test - only runs if API key is available
+test("cleanVTT uses AI when available", async () => {
+if (!process.env.LLM_API_KEY) {
+console.log("Skipping AI test - no LLM_API_KEY set");
+return;
+}
-const result = await cleanVTT("real-test", realVTT);
+const result = await cleanVTT("test-ai", sampleVTT);
-expect(result).toContain("WEBVTT");
-// Check that it has multiple paragraph numbers
-const paraMatches = result.match(/Paragraph (\d+)-\d+/g);
-const uniqueParas = new Set(paraMatches?.map(m => m.match(/Paragraph (\d+)/)?.[1]));
-expect(uniqueParas.size).toBeGreaterThan(1);
-console.log("Paragraphs found:", uniqueParas.size);
-} finally {
-process.env.OPENROUTER_API_KEY = originalApiKey;
-}
+expect(result).toContain("WEBVTT");
+expect(result).toContain("-->");
+
+// AI should clean up tags
+expect(result).not.toContain("<|startoftranscript|>");
+expect(result).not.toContain("[SIDE CONVERSATION]");
+
+// Should have paragraph formatting
+expect(result).toContain("Paragraph");
+
+console.log("AI-cleaned VTT preview:", result.substring(0, 300));
}, 30000);
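
Note: the assertions above imply the segment shape that parseVTT returns, whose implementation is collapsed below. A minimal sketch of that shape, assuming it matches the VTTSegment interface touched in src/lib/vtt-cleaner.ts (the field comments are inferred from the tests, not part of the commit):

// Inferred contract: one segment per cue; "WEBVTT\n\n" yields an empty array.
interface VTTSegment {
  index?: number | string; // cue identifier, e.g. "Paragraph 1-2" after cleaning
  timestamp: string;       // e.g. "00:00:03.480 --> 00:00:05.000"
  text: string;            // cue text
  start?: number;          // non-negative cue start
  end?: number;            // non-negative cue end
}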
+97 -105
src/lib/vtt-cleaner.ts
···
-// Parse and clean VTT files
-
-import type { ParagraphBoundary } from "./transcript-cleaner";
+// Parse and clean VTT files using AI
interface VTTSegment {
-index?: number;
+index?: number | string;
timestamp: string;
text: string;
start?: number;
···
}
/**
-* Clean VTT text segments by removing tags and fixing grammar.
-* Additionally, merge cleaned segments into paragraph cues while preserving
-* stable paragraph IDs (derived from first segment start time).
+* Clean VTT text using AI to create paragraph-separated VTT file.
+* Uses OpenRouter API to intelligently group segments into paragraphs
+* while preserving timing information.
*/
export async function cleanVTT(
transcriptionId: string,
···
`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
);
-// Combine all text for cleaning and paragraphing
-const allText = segments.map((s) => s.text).join(" ");
-
-// Attempt LLM-driven cleaning and paragraphing in one request, fallback to deterministic rules
-let paragraphBoundaries: ParagraphBoundary[] = [];
+const apiKey = process.env.LLM_API_KEY;
+const apiBaseUrl = process.env.LLM_API_BASE_URL;
+const model = process.env.LLM_MODEL;
+
+if (!apiKey || !apiBaseUrl || !model) {
+console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
+return vttContent;
+}
try {
-const { cleanAndGetParagraphBoundaries } = await import(
-"./transcript-cleaner"
-);
-const result = await cleanAndGetParagraphBoundaries({
-transcriptId: transcriptionId,
-rawTranscript: allText,
-segments: segments.map((s) => ({
-index: s.index,
-start: s.start,
-end: s.end,
-text: s.text,
-})),
-maxWordsMove: 0,
-});
+// Build the input for the AI
+const inputSegments = segments.map((seg, idx) => ({
+index: idx,
+timestamp: seg.timestamp,
+text: seg.text,
+}));
+
+const prompt = `Can you turn this into a paragraph separated vtt file?
+
+Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
+
+Paragraph 1-1
+00:00:00.000 --> 00:00:05.559
+Today in chapel we are talking about the fact that we believe in having gospel
+
+Paragraph 1-2
+00:00:05.559 --> 00:00:08.639
+conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's
+
+Paragraph 1-3
+00:00:08.639 --> 00:00:11.960
+gonna be a little more conversational than normal.
-if (result?.paragraphs) {
-paragraphBoundaries = result.paragraphs;
-}
-} catch (e) {
-console.warn(
-"[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
-e,
-);
-}
+Paragraph 2-1
+00:00:11.960 --> 00:00:15.000
+Now let's talk about something different.
-if (paragraphBoundaries.length === 0) {
-// No paragraphs detected, treat as one big paragraph
-paragraphBoundaries = [
-{
-startSegmentIndex: 0,
-endSegmentIndex: segments.length - 1,
-text: allText,
-},
-];
-}
+I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
-// Get the full cleaned transcript from paragraphs
-const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
+Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
-// Split cleaned text back into segments proportionally (word-based)
-const words = cleanedTranscript.split(/\s+/).filter(Boolean);
-const originalWords = allText.split(/\s+/).filter(Boolean);
-const ratio = words.length / Math.max(1, originalWords.length);
+Input segments:
+${JSON.stringify(inputSegments, null, 2)}
-let wordIndex = 0;
-const cleanedSegments: VTTSegment[] = [];
+Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
-for (const segment of segments) {
-const originalWordCount = Math.max(
-1,
-segment.text.split(/\s+/).filter(Boolean).length,
+const response = await fetch(
+`${apiBaseUrl}/chat/completions`,
+{
+method: "POST",
+headers: {
+"Content-Type": "application/json",
+"Authorization": `Bearer ${apiKey}`,
+"HTTP-Referer": "https://thistle.app",
+"X-Title": "Thistle Transcription",
+},
+body: JSON.stringify({
+model,
+messages: [
+{ role: "user", content: prompt },
+],
+temperature: 0.3,
+max_tokens: 16384,
+}),
+},
);
-const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
-const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
-wordIndex += newWordCount;
+
+if (!response.ok) {
+const errorText = await response.text();
+console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
+console.warn("[VTTCleaner] Falling back to uncleaned VTT");
+return vttContent;
+}
-cleanedSegments.push({
-index: segment.index,
-timestamp: segment.timestamp,
-text: segmentWords.join(" "),
-start: segment.start,
-end: segment.end,
-});
-}
+const result = await response.json();
+const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
-// If any remaining words, append to last segment
-if (wordIndex < words.length && cleanedSegments.length > 0) {
-const rest = words.slice(wordIndex).join(" ");
-const lastIdx = cleanedSegments.length - 1;
-const lastSeg = cleanedSegments[lastIdx];
-if (lastSeg) {
-lastSeg.text += (lastSeg.text ? " " : "") + rest;
+if (!cleanedVTT) {
+console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
+return vttContent;
}
-}
-// Assign paragraph-based IDs to segments
-for (let i = 0; i < cleanedSegments.length; i++) {
-const seg = cleanedSegments[i];
-if (!seg) continue;
+// Extract VTT content if the model wrapped it in markdown
+let finalVTT = cleanedVTT;
+if (cleanedVTT.includes("```")) {
+const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
+if (vttMatch?.[1]) {
+finalVTT = vttMatch[1].trim();
+}
+}
-// Find which paragraph this segment belongs to
-let paraIndex = 0;
-let segmentInPara = 1;
-for (let p = 0; p < paragraphBoundaries.length; p++) {
-const para = paragraphBoundaries[p];
-if (i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
-paraIndex = p + 1;
-segmentInPara = i - para.startSegmentIndex + 1;
-break;
+// Ensure it starts with WEBVTT
+if (!finalVTT.startsWith("WEBVTT")) {
+const webvttIndex = finalVTT.indexOf("WEBVTT");
+if (webvttIndex !== -1) {
+finalVTT = finalVTT.substring(webvttIndex);
+} else {
+finalVTT = `WEBVTT\n\n${finalVTT}`;
}
}
-// Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
-seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
-}
+console.log(
+`[VTTCleaner] Successfully cleaned ${segments.length} segments using AI`,
+);
-// Build output VTT with cleaned segment cues having paragraph-based IDs
-let output = "WEBVTT\n\n";
-for (const seg of cleanedSegments) {
-if (!seg || !seg.timestamp || !seg.text) continue;
-output += `${seg.index}\n`;
-output += `${seg.timestamp}\n`;
-output += `${seg.text}\n\n`;
+return finalVTT;
+} catch (err) {
+console.error("[VTTCleaner] Exception:", err);
+console.warn("[VTTCleaner] Falling back to uncleaned VTT");
+return vttContent;
}
-
-console.log(
-`[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,
-);
-
-return output;
}
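
For context, the rewritten cleanVTT is driven entirely by environment configuration and falls back to returning the input VTT unchanged on any failure. A minimal caller sketch, assuming an OpenAI-compatible endpoint such as OpenRouter and a hypothetical model id; none of the concrete values below are part of the commit:

// Hypothetical wiring; the base URL and model id are placeholders.
import { cleanVTT } from "./src/lib/vtt-cleaner";

process.env.LLM_API_BASE_URL ??= "https://openrouter.ai/api/v1";
process.env.LLM_MODEL ??= "example/model-id"; // placeholder
// LLM_API_KEY must be supplied by the deployment; if any of the three
// variables is missing, cleanVTT logs a warning and returns the raw VTT.

const rawVTT = `WEBVTT

00:00:00.000 --> 00:00:03.480
<|startoftranscript|> [SIDE CONVERSATION]<|endoftext|>`;

// Returns AI-cleaned, paragraph-cued VTT, or the raw VTT on the fallback path.
const cleaned = await cleanVTT("example-id", rawVTT);
console.log(cleaned.startsWith("WEBVTT")); // true on either path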