🪻 distributed transcription service thistle.dunkirk.sh

chore: ai-based cleaner

dunkirk.sh 2c25d3a2 a88b2813

verified
+72 -17
src/components/transcription.ts
···
import { css, html, LitElement } from "lit";
import { customElement, state } from "lit/decorators.js";
interface TranscriptionJob {
id: string;
···
created_at: number;
audioUrl?: string;
vttSegments?: VTTSegment[];
}
interface VTTSegment {
start: number;
end: number;
text: string;
}
···
let i = 0;
// Skip WEBVTT header
- while (i < lines.length && !lines[i]?.includes("-->")) {
i++;
}
while (i < lines.length) {
- const line = lines[i];
- if (line?.includes("-->")) {
- const [startStr, endStr] = line.split("-->").map((s) => s.trim());
const start = parseVTTTimestamp(startStr || "");
const end = parseVTTTimestamp(endStr || "");
···
start,
end,
text: textLines.join(" ").trim(),
});
}
- i++;
}
return segments;
···
border-radius: 2px;
}
.audio-player {
margin-top: 1rem;
width: 100%;
···
if (response.ok) {
const vttContent = await response.text();
const segments = parseVTT(vttContent);
-
- // Update job with VTT segments
const job = this.jobs.find((j) => j.id === jobId);
if (job) {
job.vttSegments = segments;
job.audioUrl = `/api/transcriptions/${jobId}/audio`;
this.jobs = [...this.jobs];
···
"audio/wav", // WAV
"audio/x-wav", // WAV (alternative)
"audio/m4a", // M4A
"audio/mp4", // MP4 audio
"audio/aac", // AAC
"audio/ogg", // OGG
···
}
private renderTranscript(job: TranscriptionJob) {
- if (!job.vttSegments) {
const displayed = this.displayedTranscripts.get(job.id) || "";
return displayed;
}
- const segments = job.vttSegments;
- // Render segments as clickable spans
- return html`${segments.map(
- (segment, idx) => html`<span
- class="segment"
- data-start="${segment.start}"
- data-end="${segment.end}"
- >${segment.text}</span>${idx < segments.length - 1 ? " " : ""}`,
- )}`;
}
···
import { css, html, LitElement } from "lit";
import { customElement, state } from "lit/decorators.js";
+ import { parseVTT } from "../lib/vtt-cleaner";
interface TranscriptionJob {
id: string;
···
created_at: number;
audioUrl?: string;
vttSegments?: VTTSegment[];
+ vttContent?: string;
}
interface VTTSegment {
start: number;
end: number;
text: string;
+ index?: string;
}
···
let i = 0;
// Skip WEBVTT header
+ while (i < lines.length && lines[i]?.trim() !== "WEBVTT") {
i++;
}
+ i++; // Skip WEBVTT
while (i < lines.length) {
+ let index: string | undefined;
+ // Check for cue ID (line before timestamp)
+ if (lines[i]?.trim() && !lines[i]?.includes("-->")) {
+ index = lines[i]?.trim();
+ i++;
+ }
+
+ if (i < lines.length && lines[i]?.includes("-->")) {
+ const [startStr, endStr] = (lines[i] || "").split("-->").map((s) => s.trim());
const start = parseVTTTimestamp(startStr || "");
const end = parseVTTTimestamp(endStr || "");
···
start,
end,
text: textLines.join(" ").trim(),
+ index,
});
+ } else {
+ i++;
}
}
return segments;
···
border-radius: 2px;
}
+ .paragraph {
+ display: block;
+ margin: 0 0 1rem 0;
+ line-height: 1.6;
+ }
+
.audio-player {
margin-top: 1rem;
width: 100%;
···
if (response.ok) {
const vttContent = await response.text();
const segments = parseVTT(vttContent);
+
+ // Update job with VTT content and segments
const job = this.jobs.find((j) => j.id === jobId);
if (job) {
+ job.vttContent = vttContent;
job.vttSegments = segments;
job.audioUrl = `/api/transcriptions/${jobId}/audio`;
this.jobs = [...this.jobs];
···
"audio/wav", // WAV
"audio/x-wav", // WAV (alternative)
"audio/m4a", // M4A
+ "audio/x-m4a", // M4A (alternative)
"audio/mp4", // MP4 audio
"audio/aac", // AAC
"audio/ogg", // OGG
···
}
private renderTranscript(job: TranscriptionJob) {
+ if (!job.vttContent) {
const displayed = this.displayedTranscripts.get(job.id) || "";
return displayed;
}
+ const segments = parseVTT(job.vttContent);
+ // Group segments by paragraph (extract paragraph number from ID like "Paragraph 1-1" -> "1")
+ const paragraphGroups = new Map<string, typeof segments>();
+ for (const segment of segments) {
+ const id = String(segment.index ?? "").trim();
+ const match = id.match(/^Paragraph\s+(\d+)-/);
+ const paraNum = match?.[1] ?? "0";
+ if (!paragraphGroups.has(paraNum)) {
+ paragraphGroups.set(paraNum, []);
+ }
+ paragraphGroups.get(paraNum)!.push(segment);
+ }
+
+ // Render each paragraph group
+ const paragraphs = Array.from(paragraphGroups.values()).map((groupSegments) => {
+ // Concatenate all text in the group
+ const fullText = groupSegments.map((s) => s.text || "").join(" ");
+ // Split into sentences
+ const sentences = fullText.split(/(?<=[.!?])\s+/g).filter(Boolean);
+ // Calculate word counts for timing
+ const wordCounts = sentences.map((s) => s.split(/\s+/).filter(Boolean).length);
+ const totalWords = Math.max(1, wordCounts.reduce((a, b) => a + b, 0));
+
+ // Overall paragraph timing
+ const paraStart = Math.min(...groupSegments.map((s) => s.start ?? 0));
+ const paraEnd = Math.max(...groupSegments.map((s) => s.end ?? paraStart));
+
+ let acc = 0;
+ const paraDuration = paraEnd - paraStart;
+
+ return html`<div class="paragraph">
+ ${sentences.map((sent, si) => {
+ const words = wordCounts[si] ?? 0;
+ const startOffset = (acc / totalWords) * paraDuration;
+ acc += words;
+ const sentenceDuration = (words / totalWords) * paraDuration;
+ const endOffset = si < sentences.length - 1 ? startOffset + sentenceDuration - 0.001 : paraDuration;
+ const spanStart = paraStart + startOffset;
+ const spanEnd = paraStart + endOffset;
+ return html`<span class="segment" data-start="${spanStart}" data-end="${spanEnd}">${sent}</span>${si < sentences.length - 1 ? " " : ""}`;
+ })}
+ </div>`;
+ });
+
+ return html`${paragraphs}`;
}
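
The span timings above come from linear interpolation: each paragraph's start..end window is divided among its re-joined sentences in proportion to word count, since sentence boundaries no longer line up with the original cue times. A standalone sketch of that math with invented values (not part of the component):

const sentences = ["First thought here.", "And a somewhat longer second thought follows."];
const counts = sentences.map((s) => s.split(/\s+/).filter(Boolean).length); // [3, 7]
const total = counts.reduce((a, b) => a + b, 0); // 10 words
const paraStart = 0, paraEnd = 10; // paragraph spans 0s..10s
let acc = 0;
for (let i = 0; i < sentences.length; i++) {
  const start = paraStart + (acc / total) * (paraEnd - paraStart);
  acc += counts[i] ?? 0;
  const end = paraStart + (acc / total) * (paraEnd - paraStart);
  console.log(sentences[i], start.toFixed(1), end.toFixed(1)); // "0.0 3.0", then "3.0 10.0"
}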
+78 -37
src/lib/transcript-cleaner.test.ts
···
import { test, expect } from "bun:test";
- import { cleanTranscript } from "./transcript-cleaner";
- test("cleanTranscript removes tags and fixes grammar", async () => {
- const rawTranscript = `[SIDE CONVERSATION] Yes? So with this course packet, what quiz is and exams, and if I can study through here, what you talk about? And I give you a good review every time. Yeah, so I'd be good to just study that and then we can do it. Yeah, and all the examples and stuff that we get from class especially. And then I, like your first quiz, I give you a mock quiz exactly like the quiz. Oh, okay. so you can kind of get a feel for how I do things. [inaudible] Okay? [inaudible] Yeah. [background chatter]`;
- const result = await cleanTranscript({
- transcriptId: "test-123",
- rawTranscript,
- });
- // Check that tags are removed
- expect(result.cleanedTranscript).not.toContain("[SIDE CONVERSATION]");
- expect(result.cleanedTranscript).not.toContain("[inaudible]");
- expect(result.cleanedTranscript).not.toContain("[background chatter]");
- // Check that we got some text back
- expect(result.cleanedTranscript.length).toBeGreaterThan(0);
- expect(result.cleanedTranscript.length).toBeLessThan(rawTranscript.length);
- console.log("Original:", rawTranscript.substring(0, 100));
- console.log("Cleaned:", result.cleanedTranscript.substring(0, 100));
}, 30000); // 30s timeout for API call
- test("cleanTranscript handles empty transcript", async () => {
- const result = await cleanTranscript({
- transcriptId: "test-empty",
- rawTranscript: "",
- });
- expect(result.cleanedTranscript).toBe("");
});
- test("cleanTranscript falls back to raw transcript on API error", async () => {
- const rawTranscript = "Test transcript";
- // Test with missing API key (if it's actually set, this test might fail)
- const originalKey = process.env.GEMINI_API_KEY;
- delete process.env.GEMINI_API_KEY;
- const result = await cleanTranscript({
- transcriptId: "test-fallback",
- rawTranscript,
- });
- expect(result.cleanedTranscript).toBe(rawTranscript);
- expect(result.error).toBe("GEMINI_API_KEY not set");
- // Restore key
- if (originalKey) {
- process.env.GEMINI_API_KEY = originalKey;
- }
});
···
import { test, expect } from "bun:test";
+ import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";
+
+ test("cleanAndGetParagraphBoundaries cleans transcript and returns paragraph boundaries", async () => {
+ // Use a longer, more realistic transcript sample with natural paragraph breaks
+ const rawTranscript = `[SIDE CONVERSATION] Today in chapel we are talking about the fact that we believe in having gospel conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's gonna be a little more conversational than normal. It's not gonna be like one of the normal sermons, although I know me and my tendency it'll turn into a sermon at some point just because that's the way God made me, so I can't help it.
+
+ Alright, so when it starts just have fun with it. We'll go on. Here's what it says in our doctrinal statement. It says, "Due to the commission of Christ and the urgency of the Gospel, all believers are to engage in Gospel conversations." How many of you believe that? That's pretty weak. How many of you believe that?
+ To live God-honoring lives and to work continuously for the spread of the Gospel to their neighbors and the nations. Now, let's be honest, as we start off this morning, all of us could do a better job with personal evangelism, and all of us could do a better job with a heart for missions.
+ So I'm not up here talking to you about something I have conquered or mastered. I'm not the expert on this. In fact, when it comes to personal evangelism in my own strength, I'm often a complete failure. But I have found that even in my weakness, God can use me in powerful ways when I make myself available to Him.`;
+ // Create mock segments from raw transcript (simulating whisper output)
+ const sentences = rawTranscript.split(/\.\s+/);
+ const mockSegments: { index?: number; start?: number; end?: number; text: string }[] = [];
+ let timeOffset = 0;
+ for (let i = 0; i < sentences.length; i++) {
+ const sentence = sentences[i]?.trim();
+ if (!sentence) continue;
+ const duration = sentence.split(/\s+/).length * 0.3; // ~0.3s per word
+ mockSegments.push({
+ index: i,
+ start: timeOffset,
+ end: timeOffset + duration,
+ text: sentence,
+ });
+ timeOffset += duration;
+ }
+
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: "test-123",
+ rawTranscript,
+ segments: mockSegments,
+ maxWordsMove: 3,
+ });
+
+ // Check that we got a result
+ expect(result.paragraphs).toBeDefined();
+ expect(result.paragraphs!.length).toBeGreaterThan(1); // Should have multiple paragraphs
+
+ // Check that paragraphs have the expected structure
+ for (const para of result.paragraphs!) {
+ expect(para).toHaveProperty("startSegmentIndex");
+ expect(para).toHaveProperty("endSegmentIndex");
+ expect(para).toHaveProperty("text");
+ expect(para.text.length).toBeGreaterThan(0);
+ }
+
+ // The cleaned text should have tags removed
+ const cleanedText = result.paragraphs!.map((p) => p.text).join(" ");
+ expect(cleanedText).not.toContain("[SIDE CONVERSATION]");
+ expect(cleanedText.toLowerCase()).toContain("gospel");
+ expect(cleanedText.toLowerCase()).toContain("evangelism");
+ console.log(`Detected ${result.paragraphs!.length} paragraphs from ${mockSegments.length} segments`);
+ console.log("First paragraph:", result.paragraphs![0]?.text.substring(0, 100) + "...");
+ console.log("Last paragraph:", result.paragraphs![result.paragraphs!.length - 1]?.text.substring(0, 100) + "...");
}, 30000); // 30s timeout for API call
+ test("cleanAndGetParagraphBoundaries handles empty transcript", async () => {
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: "test-empty",
+ rawTranscript: "",
+ segments: [],
+ maxWordsMove: 3,
+ });
+ expect(result.paragraphs).toEqual([]);
});
+ test("cleanAndGetParagraphBoundaries returns error on missing API key", async () => {
+ const rawTranscript = "Test transcript";
+ // Unset the API key to force the error path; it is restored below
+ const originalKey = process.env.OPENROUTER_API_KEY;
+ delete process.env.OPENROUTER_API_KEY;
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: "test-fallback",
+ rawTranscript,
+ segments: [{ text: rawTranscript }],
+ maxWordsMove: 3,
+ });
+ expect(result.paragraphs).toBeUndefined();
+ expect(result.error).toBe("OPENROUTER_API_KEY not set");
+ // Restore key
+ if (originalKey) {
+ process.env.OPENROUTER_API_KEY = originalKey;
+ }
});
+91 -84
src/lib/transcript-cleaner.ts
···
- // Clean up transcripts using Gemini to remove tags and fix grammar
-
- interface CleanTranscriptOptions {
- transcriptId: string;
- rawTranscript: string;
- }
-
- interface CleanTranscriptResult {
- cleanedTranscript: string;
- error?: string;
}
- /**
- * Clean transcript using Gemini Flash 2.0 (cheapest model)
- * Removes tags like [SIDE CONVERSATION], [inaudible], etc.
- * Fixes grammar while preserving sentence structure
- */
- export async function cleanTranscript({
transcriptId,
rawTranscript,
- }: CleanTranscriptOptions): Promise<CleanTranscriptResult> {
- const apiKey = process.env.GEMINI_API_KEY;
if (!apiKey) {
- return {
- cleanedTranscript: rawTranscript,
- error: "GEMINI_API_KEY not set",
- };
}
- // Skip cleaning if transcript is empty
- if (!rawTranscript || rawTranscript.trim().length === 0) {
- return {
- cleanedTranscript: rawTranscript,
- };
- }
- console.log(
- `[TranscriptCleaner] Starting cleanup for ${transcriptId} (${rawTranscript.length} chars)`,
- );
- try {
- const prompt = `You are a transcript editor. Clean up this transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
···
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")
- Return ONLY the cleaned transcript text, nothing else.
- Transcript to clean:
${rawTranscript}`;
const response = await fetch(
- "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent",
{
method: "POST",
headers: {
"Content-Type": "application/json",
- "x-goog-api-key": apiKey,
},
body: JSON.stringify({
- contents: [
- {
- parts: [{ text: prompt }],
- },
],
- generationConfig: {
- temperature: 0.3,
- topK: 40,
- topP: 0.95,
- maxOutputTokens: 8192,
- },
}),
},
);
if (!response.ok) {
const errorText = await response.text();
- console.error(
- `[TranscriptCleaner] Gemini API error for ${transcriptId}:`,
- errorText,
- );
- return {
- cleanedTranscript: rawTranscript,
- error: `Gemini API error: ${response.status}`,
- };
}
const result = await response.json();
- const cleanedText =
- result.candidates?.[0]?.content?.parts?.[0]?.text?.trim();
- if (!cleanedText) {
- console.warn(
- `[TranscriptCleaner] Empty response from Gemini for ${transcriptId}`,
- );
- return {
- cleanedTranscript: rawTranscript,
- error: "Empty response from Gemini",
- };
}
- const reduction = Math.round(
- ((rawTranscript.length - cleanedText.length) / rawTranscript.length) *
- 100,
- );
- console.log(
- `[TranscriptCleaner] Completed for ${transcriptId}: ${rawTranscript.length} → ${cleanedText.length} chars (${reduction}% reduction)`,
- );
- return {
- cleanedTranscript: cleanedText,
- };
- } catch (error) {
- console.error(
- `[TranscriptCleaner] Failed to clean ${transcriptId}:`,
- error,
- );
- return {
- cleanedTranscript: rawTranscript,
- error: error instanceof Error ? error.message : "Unknown error",
- };
}
}
···
+ // Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.
+ export interface ParagraphBoundary {
+ startSegmentIndex: number;
+ endSegmentIndex: number;
+ text: string;
+ // Optional: list of moved words for auditing
+ movedWords?: { word: string; fromSegmentIndex: number; toSegmentIndex: number }[];
}
+ // Cleans the transcript and determines paragraph boundaries in one LLM request.
+ // Returns paragraph boundaries as a JSON array.
+ export async function cleanAndGetParagraphBoundaries({
transcriptId,
rawTranscript,
+ segments,
+ maxWordsMove = 0,
+ }: {
+ transcriptId: string;
+ rawTranscript: string;
+ segments: { index?: number | string; start?: number; end?: number; text: string }[];
+ maxWordsMove?: number;
+ }): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
+ // Skip processing if transcript is empty
+ if (!rawTranscript || rawTranscript.trim().length === 0) {
+ return { paragraphs: [] };
+ }
+ const apiKey = process.env.OPENROUTER_API_KEY;
+ const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
if (!apiKey) {
+ return { error: "OPENROUTER_API_KEY not set" };
}
+ try {
+ const segmentsPayload = segments.map((s) => ({
+ index: s.index ?? null,
+ start: s.start ?? null,
+ end: s.end ?? null,
+ text: s.text ?? "",
+ }));
+ const prompt = `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.
+ Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
···
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")
+ Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
+ - Natural topic changes or shifts in the speaker's focus
+ - Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
+ - Complete narrative beats or examples
+ - Typical spoken paragraph length (30-120 seconds / 5-20 segments)
+
+ CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.
+
+ RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):
+
+ [ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]
+
+ Rules for paragraphing:
+ - ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
+ - NEVER break a paragraph in the middle of a sentence
+ - Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
+ - DO NOT put the entire transcript in a single paragraph
+ - Paragraphs must reference original segment indexes
+ - Do not move words across segment boundaries
+ - Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission
+
+ Segments:
+ ${JSON.stringify(segmentsPayload, null, 2)}
+ Raw Transcript:
${rawTranscript}`;
const response = await fetch(
+ "https://openrouter.ai/api/v1/chat/completions",
{
method: "POST",
headers: {
"Content-Type": "application/json",
+ "Authorization": `Bearer ${apiKey}`,
+ "HTTP-Referer": "https://thistle.app",
+ "X-Title": "Thistle Transcription",
},
body: JSON.stringify({
+ model,
+ messages: [
+ { role: "user", content: prompt },
],
+ temperature: 0.0,
+ max_tokens: 8192,
}),
},
);
if (!response.ok) {
const errorText = await response.text();
+ console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
+ return { error: `OpenRouter API error: ${response.status}` };
}
const result = await response.json();
+ const raw = result.choices?.[0]?.message?.content?.trim();
+ if (!raw) {
+ return { error: "Empty paragrapher response" };
+ }
+ let parsed: ParagraphBoundary[] | null = null;
+ try {
+ parsed = JSON.parse(raw) as ParagraphBoundary[];
+ } catch {
+ // Attempt to extract a JSON substring if the model padded the array with extra text
+ const firstBracket = raw.indexOf("[");
+ const lastBracket = raw.lastIndexOf("]");
+ if (firstBracket >= 0 && lastBracket > firstBracket) {
+ const substr = raw.substring(firstBracket, lastBracket + 1);
+ parsed = JSON.parse(substr) as ParagraphBoundary[];
+ }
}
+ if (!parsed || !Array.isArray(parsed)) {
+ return { error: "Failed to parse paragrapher JSON" };
+ }
+ return { paragraphs: parsed };
+ } catch (err) {
+ console.error("[Paragrapher] Exception:", err);
+ return { error: err instanceof Error ? err.message : "Unknown error" };
}
}
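
A hypothetical call site for the new helper, with invented segment values (the field shapes follow the signature above; none of this is from the repo):

import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";

const { paragraphs, error } = await cleanAndGetParagraphBoundaries({
  transcriptId: "demo",
  rawTranscript: "Hello everyone. [inaudible] Today we cover VTT cleanup.",
  segments: [
    { index: 0, start: 0, end: 2.5, text: "Hello everyone. [inaudible]" },
    { index: 1, start: 2.5, end: 5.0, text: "Today we cover VTT cleanup." },
  ],
  maxWordsMove: 0, // accepted by the signature, but unused in the body shown in this diff
});
if (error) console.warn(error);
else console.log(paragraphs); // e.g. [{ startSegmentIndex: 0, endSegmentIndex: 1, text: "..." }]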
+13 -8
src/lib/transcription.ts
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
`${this.serviceUrl}/transcribe/${whisperJobId}?format=vtt`,
);
if (vttResponse.ok) {
- const vttContent = await vttResponse.text();
- const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
- await saveTranscriptVTT(transcriptionId, cleanedVTT);
- }
} catch (error) {
console.warn(
`[Transcription] Failed to fetch VTT for ${transcriptionId}:`,
···
status?: TranscriptionStatus;
progress?: number;
error_message?: string;
},
) {
const updates: string[] = [];
···
updates.push("error_message = ?");
values.push(data.error_message);
}
updates.push("updated_at = ?");
values.push(Math.floor(Date.now() / 1000));
···
`${this.serviceUrl}/transcribe/${whisperJob.id}?format=vtt`,
);
if (vttResponse.ok) {
- const vttContent = await vttResponse.text();
- const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
- await saveTranscriptVTT(transcriptionId, cleanedVTT);
- }
} catch (error) {
console.warn(
`[Sync] Failed to fetch VTT for ${transcriptionId}:`,
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
+ import { parseVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
`${this.serviceUrl}/transcribe/${whisperJobId}?format=vtt`,
);
if (vttResponse.ok) {
+ const vttContent = await vttResponse.text();
+ const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+ await saveTranscriptVTT(transcriptionId, cleanedVTT);
+ this.updateTranscription(transcriptionId, {});
+ }
} catch (error) {
console.warn(
`[Transcription] Failed to fetch VTT for ${transcriptionId}:`,
···
status?: TranscriptionStatus;
progress?: number;
error_message?: string;
+ vttContent?: string;
},
) {
const updates: string[] = [];
···
updates.push("error_message = ?");
values.push(data.error_message);
}
+
updates.push("updated_at = ?");
values.push(Math.floor(Date.now() / 1000));
···
`${this.serviceUrl}/transcribe/${whisperJob.id}?format=vtt`,
);
if (vttResponse.ok) {
+ const vttContent = await vttResponse.text();
+ const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+ await saveTranscriptVTT(transcriptionId, cleanedVTT);
+ this.updateTranscription(transcriptionId, {});
+ }
} catch (error) {
console.warn(
`[Sync] Failed to fetch VTT for ${transcriptionId}:`,
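
The empty-object updateTranscription calls above work because the method unconditionally appends updated_at before writing, so they reduce to a timestamp bump. Sketched effect, assuming a transcriptions table (the table name and WHERE clause are not shown in this diff):

// updateTranscription(id, {}) effectively runs:
// UPDATE transcriptions SET updated_at = ? WHERE id = ?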
+44
src/lib/vtt-cleaner.test.ts
···
import { test, expect } from "bun:test";
import { cleanVTT } from "./vtt-cleaner";
const sampleVTT = `WEBVTT
···
expect(result).toBe(emptyVTT);
});
···
import { test, expect } from "bun:test";
import { cleanVTT } from "./vtt-cleaner";
+ import { readFileSync } from "fs";
+ import { join } from "path";
const sampleVTT = `WEBVTT
···
expect(result).toBe(emptyVTT);
});
+
+ test("cleanVTT detects multiple paragraphs", async () => {
+ const multiParaVTT = `WEBVTT
+
+ Paragraph 1-1
+ 00:00:00.000 --> 00:00:00.000
+ Again, thank you for the privilege to not only study here, but also to teach here. Jesus,
+
+ Paragraph 1-2
+ 00:00:00.000 --> 00:00:00.000
+ thank you. All`;
+
+ const result = await cleanVTT("test-multi-para", multiParaVTT);
+
+ expect(result).toContain("Paragraph 1-1");
+ expect(result).toContain("Paragraph 2-1");
+ // Should have at least two paragraphs
+ const paraMatches = result.match(/Paragraph \d+-\d+/g);
+ expect(paraMatches?.length).toBeGreaterThan(1);
+ }, 30000);
+
+ test("cleanVTT with real transcription data", async () => {
+ const originalApiKey = process.env.OPENROUTER_API_KEY;
+ // Temporarily unset to force the non-LLM fallback
+ delete process.env.OPENROUTER_API_KEY;
+
+ try {
+ const vttPath = join(__dirname, "../../transcripts/d69d8076-598a-4fe5-8100-fe3eff47fcd6.vtt");
+ const realVTT = readFileSync(vttPath, "utf-8");
+
+ const result = await cleanVTT("real-test", realVTT);
+
+ expect(result).toContain("WEBVTT");
+ // Check that it has multiple paragraph numbers
+ const paraMatches = result.match(/Paragraph (\d+)-\d+/g);
+ const uniqueParas = new Set(paraMatches?.map(m => m.match(/Paragraph (\d+)/)?.[1]));
+ expect(uniqueParas.size).toBeGreaterThan(1);
+ console.log("Paragraphs found:", uniqueParas.size);
+ } finally {
+ if (originalApiKey) {
+ process.env.OPENROUTER_API_KEY = originalApiKey;
+ }
+ }
+ }, 30000);
+147 -29
src/lib/vtt-cleaner.ts
···
// Parse and clean VTT files
- import { cleanTranscript } from "./transcript-cleaner";
interface VTTSegment {
index?: number;
timestamp: string;
text: string;
}
/**
- * Parse VTT content into segments
*/
- function parseVTT(vttContent: string): VTTSegment[] {
const lines = vttContent.split("\n");
const segments: VTTSegment[] = [];
let currentSegment: Partial<VTTSegment> = {};
···
if (!line) {
if (currentSegment.timestamp && currentSegment.text) {
segments.push(currentSegment as VTTSegment);
currentSegment = {};
}
···
continue;
}
// Check if it's a timestamp line
if (line.includes("-->")) {
currentSegment.timestamp = line;
// Next line(s) will be text
const textLines: string[] = [];
i++;
- while (i < lines.length && lines[i]?.trim() && !lines[i]?.includes("-->")) {
textLines.push(lines[i] || "");
i++;
}
···
// Add last segment if exists
if (currentSegment.timestamp && currentSegment.text) {
segments.push(currentSegment as VTTSegment);
}
···
}
/**
- * Clean VTT text segments by removing tags and fixing grammar
*/
export async function cleanVTT(
transcriptionId: string,
···
}
console.log(
- `[VTTCleaner] Cleaning ${segments.length} segments for ${transcriptionId}`,
);
- // Combine all text for cleaning
const allText = segments.map((s) => s.text).join(" ");
- const { cleanedTranscript, error } = await cleanTranscript({
- transcriptId: transcriptionId,
- rawTranscript: allText,
- });
- if (error) {
- console.warn(`[VTTCleaner] Falling back to original VTT: ${error}`);
- return vttContent;
}
- // Split cleaned text back into segments
- // Use simple word-based splitting proportional to original segment lengths
- const words = cleanedTranscript.split(/\s+/);
- const originalWords = allText.split(/\s+/);
- const ratio = words.length / originalWords.length;
let wordIndex = 0;
const cleanedSegments: VTTSegment[] = [];
for (const segment of segments) {
- const originalWordCount = segment.text.split(/\s+/).length;
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
wordIndex += newWordCount;
cleanedSegments.push({
timestamp: segment.timestamp,
text: segmentWords.join(" "),
- index: segment.index,
});
}
- // Rebuild VTT
- let output = "WEBVTT\n\n";
- for (const segment of cleanedSegments) {
- if (segment.index !== undefined) {
- output += `${segment.index}\n`;
}
- output += `${segment.timestamp}\n`;
- output += `${segment.text}\n\n`;
}
- console.log(`[VTTCleaner] Completed for ${transcriptionId}`);
return output;
}
···
// Parse and clean VTT files
+ import type { ParagraphBoundary } from "./transcript-cleaner";
interface VTTSegment {
+ index?: number | string;
timestamp: string;
text: string;
+ start?: number;
+ end?: number;
}
/**
+ * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds
*/
+ function parseTimestampToSeconds(ts?: string): number {
+ if (!ts) return 0;
+ // ts expected like "00:00:09.039"
+ const parts = ts.split(":").map((p) => p.trim());
+ const hh = parts[0] ?? "0";
+ const mm = parts[1] ?? "0";
+ const ss = parts[2] ?? "0";
+ if (parts.length === 3) {
+ const seconds =
+ parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
+ return seconds;
+ } else if (parts.length === 2) {
+ // Two-part timestamps are mm:ss.mmm, so parts[0] is minutes and parts[1] is seconds
+ return parseInt(hh, 10) * 60 + parseFloat(mm);
+ }
+ return 0;
+ }
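// Illustrative values for the parser above (not part of the diff):
// parseTimestampToSeconds("00:01:09.039") -> 69.039
// parseTimestampToSeconds("01:09.039") -> 69.039
// parseTimestampToSeconds("") -> 0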
+
+ /**
+ * Parse VTT content into segments, populating start/end in seconds
+ */
+ export function parseVTT(vttContent: string): VTTSegment[] {
const lines = vttContent.split("\n");
const segments: VTTSegment[] = [];
let currentSegment: Partial<VTTSegment> = {};
···
if (!line) {
if (currentSegment.timestamp && currentSegment.text) {
+ // parse start/end
+ const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+ currentSegment.timestamp || "",
+ );
+ if (match) {
+ currentSegment.start = parseTimestampToSeconds(match[1]);
+ currentSegment.end = parseTimestampToSeconds(match[2]);
+ }
segments.push(currentSegment as VTTSegment);
currentSegment = {};
}
···
continue;
}
+ // Check if it's a cue id (before timestamp)
+ if (!currentSegment.timestamp && line && !line.includes("-->")) {
+ currentSegment.index = line;
+ continue;
+ }
+
// Check if it's a timestamp line
if (line.includes("-->")) {
currentSegment.timestamp = line;
// Next line(s) will be text
const textLines: string[] = [];
i++;
+ while (
+ i < lines.length &&
+ lines[i]?.trim() &&
+ !lines[i]?.includes("-->")
+ ) {
textLines.push(lines[i] || "");
i++;
}
···
// Add last segment if exists
if (currentSegment.timestamp && currentSegment.text) {
+ const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+ currentSegment.timestamp || "",
+ );
+ if (match?.[1] && match[2]) {
+ currentSegment.start = parseTimestampToSeconds(match[1]);
+ currentSegment.end = parseTimestampToSeconds(match[2]);
+ }
segments.push(currentSegment as VTTSegment);
}
···
}
/**
+ * Clean VTT text segments by removing tags and fixing grammar.
+ * Additionally, group cleaned segments into paragraphs and assign
+ * paragraph-based cue IDs of the form "Paragraph N-M".
*/
export async function cleanVTT(
transcriptionId: string,
···
}
console.log(
+ `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
);
+ // Combine all text for cleaning and paragraphing
const allText = segments.map((s) => s.text).join(" ");
+ // Attempt LLM-driven cleaning and paragraphing in one request; fall back to a single catch-all paragraph below
+ let paragraphBoundaries: ParagraphBoundary[] = [];
+ try {
+ const { cleanAndGetParagraphBoundaries } = await import(
+ "./transcript-cleaner"
+ );
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: transcriptionId,
+ rawTranscript: allText,
+ segments: segments.map((s) => ({
+ index: s.index,
+ start: s.start,
+ end: s.end,
+ text: s.text,
+ })),
+ maxWordsMove: 0,
+ });
+
+ if (result?.paragraphs) {
+ paragraphBoundaries = result.paragraphs;
+ }
+ } catch (e) {
+ console.warn(
+ "[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
+ e,
+ );
}
+ if (paragraphBoundaries.length === 0) {
+ // No paragraphs detected, treat as one big paragraph
+ paragraphBoundaries = [
+ {
+ startSegmentIndex: 0,
+ endSegmentIndex: segments.length - 1,
+ text: allText,
+ },
+ ];
+ }
+
+ // Get the full cleaned transcript from paragraphs
+ const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
+
+ // Split cleaned text back into segments proportionally (word-based)
+ const words = cleanedTranscript.split(/\s+/).filter(Boolean);
+ const originalWords = allText.split(/\s+/).filter(Boolean);
+ const ratio = words.length / Math.max(1, originalWords.length);
let wordIndex = 0;
const cleanedSegments: VTTSegment[] = [];
for (const segment of segments) {
+ const originalWordCount = Math.max(
+ 1,
+ segment.text.split(/\s+/).filter(Boolean).length,
+ );
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
wordIndex += newWordCount;
cleanedSegments.push({
+ index: segment.index,
timestamp: segment.timestamp,
text: segmentWords.join(" "),
+ start: segment.start,
+ end: segment.end,
});
}
+ // If any remaining words, append to last segment
+ if (wordIndex < words.length && cleanedSegments.length > 0) {
+ const rest = words.slice(wordIndex).join(" ");
+ const lastIdx = cleanedSegments.length - 1;
+ const lastSeg = cleanedSegments[lastIdx];
+ if (lastSeg) {
+ lastSeg.text += (lastSeg.text ? " " : "") + rest;
}
}
+ // Assign paragraph-based IDs to segments
+ for (let i = 0; i < cleanedSegments.length; i++) {
+ const seg = cleanedSegments[i];
+ if (!seg) continue;
+
+ // Find which paragraph this segment belongs to
+ let paraIndex = 0;
+ let segmentInPara = 1;
+ for (let p = 0; p < paragraphBoundaries.length; p++) {
+ const para = paragraphBoundaries[p];
+ if (para && i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
+ paraIndex = p + 1;
+ segmentInPara = i - para.startSegmentIndex + 1;
+ break;
+ }
+ }
+
+ // Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
+ seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
+ }
+
+ // Build output VTT with cleaned segment cues having paragraph-based IDs
+ let output = "WEBVTT\n\n";
+ for (const seg of cleanedSegments) {
+ if (!seg || !seg.timestamp || !seg.text) continue;
+ output += `${seg.index}\n`;
+ output += `${seg.timestamp}\n`;
+ output += `${seg.text}\n\n`;
+ }
+
+ console.log(
+ `[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,
+ );
return output;
}
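
Given the ID scheme above, the VTT emitted by cleanVTT carries one cue per cleaned segment, numbered "Paragraph N-M" as in the tests; an illustrative output with invented times and text:

WEBVTT

Paragraph 1-1
00:00:00.000 --> 00:00:04.200
Thank you for the privilege to not only study here, but also to teach here.

Paragraph 2-1
00:00:04.200 --> 00:00:08.000
Now, let's turn to today's topic.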