🪻 distributed transcription service thistle.dunkirk.sh

chore: AI-based transcript cleaner with paragraph-boundary detection

dunkirk.sh 2c25d3a2 a88b2813

verified
+72 -17
src/components/transcription.ts
···
import { css, html, LitElement } from "lit";
import { customElement, state } from "lit/decorators.js";
+
import { parseVTT } from "../lib/vtt-cleaner";
interface TranscriptionJob {
id: string;
···
created_at: number;
audioUrl?: string;
vttSegments?: VTTSegment[];
+
vttContent?: string;
}
interface VTTSegment {
start: number;
end: number;
text: string;
+
index?: string;
}
···
let i = 0;
// Skip WEBVTT header
-
while (i < lines.length && !lines[i]?.includes("-->")) {
+
while (i < lines.length && lines[i]?.trim() !== "WEBVTT") {
i++;
}
+
i++; // Skip WEBVTT
while (i < lines.length) {
-
const line = lines[i];
-
if (line?.includes("-->")) {
-
const [startStr, endStr] = line.split("-->").map((s) => s.trim());
+
let index: string | undefined;
+
// Check for cue ID (line before timestamp)
+
if (lines[i]?.trim() && !lines[i]?.includes("-->")) {
+
index = lines[i]?.trim();
+
i++;
+
}
+
+
if (i < lines.length && lines[i]?.includes("-->")) {
+
const [startStr, endStr] = lines[i].split("-->").map((s) => s.trim());
const start = parseVTTTimestamp(startStr || "");
const end = parseVTTTimestamp(endStr || "");
···
start,
end,
text: textLines.join(" ").trim(),
+
index,
});
+
} else {
+
i++;
}
-
i++;
}
return segments;
···
border-radius: 2px;
}
+
.paragraph {
+
display: block;
+
margin: 0 0 1rem 0;
+
line-height: 1.6;
+
}
+
.audio-player {
margin-top: 1rem;
width: 100%;
···
if (response.ok) {
const vttContent = await response.text();
const segments = parseVTT(vttContent);
-
-
// Update job with VTT segments
+
+
// Update job with VTT content and segments
const job = this.jobs.find((j) => j.id === jobId);
if (job) {
+
job.vttContent = vttContent;
job.vttSegments = segments;
job.audioUrl = `/api/transcriptions/${jobId}/audio`;
this.jobs = [...this.jobs];
···
"audio/wav", // WAV
"audio/x-wav", // WAV (alternative)
"audio/m4a", // M4A
+
"audio/x-m4a", // M4A (alternative)
"audio/mp4", // MP4 audio
"audio/aac", // AAC
"audio/ogg", // OGG
···
}
private renderTranscript(job: TranscriptionJob) {
-
if (!job.vttSegments) {
+
if (!job.vttContent) {
const displayed = this.displayedTranscripts.get(job.id) || "";
return displayed;
}
-
const segments = job.vttSegments;
-
// Render segments as clickable spans
-
return html`${segments.map(
-
(segment, idx) => html`<span
-
class="segment"
-
data-start="${segment.start}"
-
data-end="${segment.end}"
-
>${segment.text}</span>${idx < segments.length - 1 ? " " : ""}`,
-
)}`;
+
const segments = parseVTT(job.vttContent);
+
// Group segments by paragraph (extract paragraph number from ID like "Paragraph 1-1" -> "1")
+
const paragraphGroups = new Map<string, typeof segments>();
+
for (const segment of segments) {
+
const id = (segment.index || '').trim();
+
const match = id.match(/^Paragraph\s+(\d+)-/);
+
const paraNum = match ? match[1] : '0';
+
if (!paragraphGroups.has(paraNum)) {
+
paragraphGroups.set(paraNum, []);
+
}
+
paragraphGroups.get(paraNum)!.push(segment);
+
}
+
+
// Render each paragraph group
+
const paragraphs = Array.from(paragraphGroups.entries()).map(([paraNum, groupSegments]) => {
+
// Concatenate all text in the group
+
const fullText = groupSegments.map(s => s.text || '').join(' ');
+
// Split into sentences
+
const sentences = fullText.split(/(?<=[\.\!\?])\s+/g).filter(Boolean);
+
// Calculate word counts for timing
+
const wordCounts = sentences.map((s) => s.split(/\s+/).filter(Boolean).length);
+
const totalWords = Math.max(1, wordCounts.reduce((a, b) => a + b, 0));
+
+
// Overall paragraph timing
+
const paraStart = Math.min(...groupSegments.map(s => s.start ?? 0));
+
const paraEnd = Math.max(...groupSegments.map(s => s.end ?? paraStart));
+
+
let acc = 0;
+
const paraDuration = paraEnd - paraStart;
+
+
return html`<div class="paragraph">
+
${sentences.map((sent, si) => {
+
const startOffset = (acc / totalWords) * paraDuration;
+
acc += wordCounts[si];
+
const sentenceDuration = (wordCounts[si] / totalWords) * paraDuration;
+
const endOffset = si < sentences.length - 1 ? startOffset + sentenceDuration - 0.001 : paraEnd - paraStart;
+
const spanStart = paraStart + startOffset;
+
const spanEnd = paraStart + endOffset;
+
return html`<span class="segment" data-start="${spanStart}" data-end="${spanEnd}">${sent}</span>${si < sentences.length - 1 ? ' ' : ''}`;
+
})}
+
</div>`;
+
});
+
+
return html`${paragraphs}`;
}
+78 -37
src/lib/transcript-cleaner.test.ts
···
import { test, expect } from "bun:test";
-
import { cleanTranscript } from "./transcript-cleaner";
+
import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";
+
+
test("cleanAndGetParagraphBoundaries cleans transcript and returns paragraph boundaries", async () => {
+
// Use a longer, more realistic transcript sample with natural paragraph breaks
+
const rawTranscript = `[SIDE CONVERSATION] Today in chapel we are talking about the fact that we believe in having gospel conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's gonna be a little more conversational than normal. It's not gonna be like one of the normal sermons, although I know me and my tendency it'll turn into a sermon at some point just because that's the way God made me, so I can't help it.
+
+
Alright, so when it starts just have fun with it. We'll go on. Here's what it says in our doctrinal statement. It says, "Due to the commission of Christ and the urgency of the Gospel, all believers are to engage in Gospel conversations." How many of you believe that? That's pretty weak. How many of you believe that?
-
test("cleanTranscript removes tags and fixes grammar", async () => {
-
const rawTranscript = `[SIDE CONVERSATION] Yes? So with this course packet, what quiz is and exams, and if I can study through here, what you talk about? And I give you a good review every time. Yeah, so I'd be good to just study that and then we can do it. Yeah, and all the examples and stuff that we get from class especially. And then I, like your first quiz, I give you a mock quiz exactly like the quiz. Oh, okay. so you can kind of get a feel for how I do things. [inaudible] Okay? [inaudible] Yeah. [background chatter]`;
+
To live God-honoring lives and to work continuously for the spread of the Gospel to their neighbors and the nations. Now, let's be honest, as we start off this morning, all of us could do a better job with personal evangelism, and all of us could do a better job with a heart for missions.
-
const result = await cleanTranscript({
-
transcriptId: "test-123",
-
rawTranscript,
-
});
+
So I'm not up here talking to you about something I have conquered or mastered. I'm not the expert on this. In fact, when it comes to personal evangelism in my own strength, I'm often a complete failure. But I have found that even in my weakness, God can use me in powerful ways when I make myself available to Him.`;
-
// Check that tags are removed
-
expect(result.cleanedTranscript).not.toContain("[SIDE CONVERSATION]");
-
expect(result.cleanedTranscript).not.toContain("[inaudible]");
-
expect(result.cleanedTranscript).not.toContain("[background chatter]");
+
// Create mock segments from raw transcript (simulating whisper output)
+
const sentences = rawTranscript.split(/\.\s+/);
+
const mockSegments: { index?: number; start?: number; end?: number; text: string }[] = [];
+
let timeOffset = 0;
+
for (let i = 0; i < sentences.length; i++) {
+
const sentence = sentences[i]?.trim();
+
if (!sentence) continue;
+
const duration = sentence.split(/\s+/).length * 0.3; // ~0.3s per word
+
mockSegments.push({
+
index: i,
+
start: timeOffset,
+
end: timeOffset + duration,
+
text: sentence,
+
});
+
timeOffset += duration;
+
}
+
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: "test-123",
+
rawTranscript,
+
segments: mockSegments,
+
maxWordsMove: 3,
+
});
+
+
// Check that we got a result
+
expect(result.paragraphs).toBeDefined();
+
expect(result.paragraphs!.length).toBeGreaterThan(1); // Should have multiple paragraphs
+
+
// Check that paragraphs have the expected structure
+
for (const para of result.paragraphs!) {
+
expect(para).toHaveProperty('startSegmentIndex');
+
expect(para).toHaveProperty('endSegmentIndex');
+
expect(para).toHaveProperty('text');
+
expect(para.text.length).toBeGreaterThan(0);
+
}
+
+
// The cleaned text should have tags removed
+
const cleanedText = result.paragraphs!.map(p => p.text).join(' ');
-
// Check that we got some text back
-
expect(result.cleanedTranscript.length).toBeGreaterThan(0);
-
expect(result.cleanedTranscript.length).toBeLessThan(rawTranscript.length);
+
expect(cleanedText).not.toContain("[SIDE CONVERSATION]");
+
expect(cleanedText.toLowerCase()).toContain("gospel");
+
expect(cleanedText.toLowerCase()).toContain("evangelism");
-
console.log("Original:", rawTranscript.substring(0, 100));
-
console.log("Cleaned:", result.cleanedTranscript.substring(0, 100));
+
console.log(`Detected ${result.paragraphs!.length} paragraphs from ${mockSegments.length} segments`);
+
console.log("First paragraph:", result.paragraphs![0]?.text.substring(0, 100) + "...");
+
console.log("Last paragraph:", result.paragraphs![result.paragraphs!.length - 1]?.text.substring(0, 100) + "...");
}, 30000); // 30s timeout for API call
-
test("cleanTranscript handles empty transcript", async () => {
-
const result = await cleanTranscript({
-
transcriptId: "test-empty",
-
rawTranscript: "",
-
});
+
test("cleanAndGetParagraphBoundaries handles empty transcript", async () => {
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: "test-empty",
+
rawTranscript: "",
+
segments: [],
+
maxWordsMove: 3,
+
});
-
expect(result.cleanedTranscript).toBe("");
+
expect(result.paragraphs).toEqual([]);
});
-
test("cleanTranscript falls back to raw transcript on API error", async () => {
-
const rawTranscript = "Test transcript";
+
test("cleanAndGetParagraphBoundaries returns error on missing API key", async () => {
+
const rawTranscript = "Test transcript";
-
// Test with missing API key (if it's actually set, this test might fail)
-
const originalKey = process.env.GEMINI_API_KEY;
-
delete process.env.GEMINI_API_KEY;
+
// Test with missing API key (if it's actually set, this test might fail)
+
const originalKey = process.env.OPENROUTER_API_KEY;
+
delete process.env.OPENROUTER_API_KEY;
-
const result = await cleanTranscript({
-
transcriptId: "test-fallback",
-
rawTranscript,
-
});
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: "test-fallback",
+
rawTranscript,
+
segments: [{ text: rawTranscript }],
+
maxWordsMove: 3,
+
});
-
expect(result.cleanedTranscript).toBe(rawTranscript);
-
expect(result.error).toBe("GEMINI_API_KEY not set");
+
expect(result.paragraphs).toBeUndefined();
+
expect(result.error).toBe("OPENROUTER_API_KEY not set");
-
// Restore key
-
if (originalKey) {
-
process.env.GEMINI_API_KEY = originalKey;
-
}
+
// Restore key
+
if (originalKey) {
+
process.env.OPENROUTER_API_KEY = originalKey;
+
}
});
+91 -84
src/lib/transcript-cleaner.ts
···
-
// Clean up transcripts using Gemini to remove tags and fix grammar
-
-
interface CleanTranscriptOptions {
-
transcriptId: string;
-
rawTranscript: string;
-
}
-
-
interface CleanTranscriptResult {
-
cleanedTranscript: string;
-
error?: string;
+
// Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.
+
export interface ParagraphBoundary {
+
startSegmentIndex: number;
+
endSegmentIndex: number;
+
text: string;
+
// Optional: list of moved words for auditing
+
movedWords?: { word: string; fromSegmentIndex: number; toSegmentIndex: number }[];
}
-
/**
-
* Clean transcript using Gemini Flash 2.0 (cheapest model)
-
* Removes tags like [SIDE CONVERSATION], [inaudible], etc.
-
* Fixes grammar while preserving sentence structure
-
*/
-
export async function cleanTranscript({
+
// Cleans transcript and determines paragraph boundaries in one LLM request.
+
// Returns paragraph boundaries as JSON array.
+
export async function cleanAndGetParagraphBoundaries({
transcriptId,
rawTranscript,
-
}: CleanTranscriptOptions): Promise<CleanTranscriptResult> {
-
const apiKey = process.env.GEMINI_API_KEY;
+
segments,
+
maxWordsMove = 0,
+
}: {
+
transcriptId: string;
+
rawTranscript: string;
+
segments: { index?: number; start?: number; end?: number; text: string }[];
+
maxWordsMove?: number;
+
}): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
+
// Skip processing if transcript is empty
+
if (!rawTranscript || rawTranscript.trim().length === 0) {
+
return { paragraphs: [] };
+
}
+
const apiKey = process.env.OPENROUTER_API_KEY;
+
const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
if (!apiKey) {
-
return {
-
cleanedTranscript: rawTranscript,
-
error: "GEMINI_API_KEY not set",
-
};
+
return { error: "OPENROUTER_API_KEY not set" };
}
-
// Skip cleaning if transcript is empty
-
if (!rawTranscript || rawTranscript.trim().length === 0) {
-
return {
-
cleanedTranscript: rawTranscript,
-
};
-
}
+
try {
+
const segmentsPayload = segments.map((s) => ({
+
index: s.index ?? null,
+
start: s.start ?? null,
+
end: s.end ?? null,
+
text: s.text ?? "",
+
}));
-
console.log(
-
`[TranscriptCleaner] Starting cleanup for ${transcriptId} (${rawTranscript.length} chars)`,
-
);
+
const prompt = `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.
-
try {
-
const prompt = `You are a transcript editor. Clean up this transcript by:
+
Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
···
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")
-
Return ONLY the cleaned transcript text, nothing else.
+
Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
+
- Natural topic changes or shifts in the speaker's focus
+
- Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
+
- Complete narrative beats or examples
+
- Typical spoken paragraph length (30-120 seconds / 5-20 segments)
+
+
CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.
+
+
RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):
+
+
[ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]
+
+
Rules for paragraphing:
+
- ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
+
- NEVER break a paragraph in the middle of a sentence
+
- Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
+
- DO NOT put the entire transcript in a single paragraph
+
- Paragraphs must reference original segment indexes
+
- Do not move words across segment boundaries
+
- Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission
+
+
Segments:
+
${JSON.stringify(segmentsPayload, null, 2)}
-
Transcript to clean:
+
Raw Transcript:
${rawTranscript}`;
const response = await fetch(
-
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent",
+
"https://openrouter.ai/api/v1/chat/completions",
{
method: "POST",
headers: {
"Content-Type": "application/json",
-
"x-goog-api-key": apiKey,
+
"Authorization": `Bearer ${apiKey}`,
+
"HTTP-Referer": "https://thistle.app",
+
"X-Title": "Thistle Transcription",
},
body: JSON.stringify({
-
contents: [
-
{
-
parts: [{ text: prompt }],
-
},
+
model,
+
messages: [
+
{ role: "user", content: prompt },
],
-
generationConfig: {
-
temperature: 0.3,
-
topK: 40,
-
topP: 0.95,
-
maxOutputTokens: 8192,
-
},
+
temperature: 0.0,
+
max_tokens: 8192,
}),
},
);
if (!response.ok) {
const errorText = await response.text();
-
console.error(
-
`[TranscriptCleaner] Gemini API error for ${transcriptId}:`,
-
errorText,
-
);
-
return {
-
cleanedTranscript: rawTranscript,
-
error: `Gemini API error: ${response.status}`,
-
};
+
console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
+
return { error: `OpenRouter API error: ${response.status}` };
}
const result = await response.json();
-
const cleanedText =
-
result.candidates?.[0]?.content?.parts?.[0]?.text?.trim();
+
const raw = result.choices?.[0]?.message?.content?.trim();
+
if (!raw) {
+
return { error: "Empty paragrapher response" };
+
}
-
if (!cleanedText) {
-
console.warn(
-
`[TranscriptCleaner] Empty response from Gemini for ${transcriptId}`,
-
);
-
return {
-
cleanedTranscript: rawTranscript,
-
error: "Empty response from Gemini",
-
};
+
let parsed: ParagraphBoundary[] | null = null;
+
try {
+
parsed = JSON.parse(raw) as ParagraphBoundary[];
+
} catch (e) {
+
// Attempt to extract JSON substring if model padded text
+
const firstBracket = raw.indexOf("[");
+
const lastBracket = raw.lastIndexOf("]");
+
if (firstBracket >= 0 && lastBracket > firstBracket) {
+
const substr = raw.substring(firstBracket, lastBracket + 1);
+
parsed = JSON.parse(substr) as ParagraphBoundary[];
+
}
}
-
const reduction = Math.round(
-
((rawTranscript.length - cleanedText.length) / rawTranscript.length) *
-
100,
-
);
-
console.log(
-
`[TranscriptCleaner] Completed for ${transcriptId}: ${rawTranscript.length} → ${cleanedText.length} chars (${reduction}% reduction)`,
-
);
+
if (!parsed || !Array.isArray(parsed)) {
+
return { error: "Failed to parse paragrapher JSON" };
+
}
-
return {
-
cleanedTranscript: cleanedText,
-
};
-
} catch (error) {
-
console.error(
-
`[TranscriptCleaner] Failed to clean ${transcriptId}:`,
-
error,
-
);
-
return {
-
cleanedTranscript: rawTranscript,
-
error: error instanceof Error ? error.message : "Unknown error",
-
};
+
return { paragraphs: parsed };
+
} catch (err) {
+
console.error("[Paragrapher] Exception:", err);
+
return { error: err instanceof Error ? err.message : "Unknown error" };
}
}
+13 -8
src/lib/transcription.ts
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
+
import { parseVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
`${this.serviceUrl}/transcribe/${whisperJobId}?format=vtt`,
);
if (vttResponse.ok) {
-
const vttContent = await vttResponse.text();
-
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
-
await saveTranscriptVTT(transcriptionId, cleanedVTT);
-
}
+
const vttContent = await vttResponse.text();
+
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+
await saveTranscriptVTT(transcriptionId, cleanedVTT);
+
this.updateTranscription(transcriptionId, {});
+
}
} catch (error) {
console.warn(
`[Transcription] Failed to fetch VTT for ${transcriptionId}:`,
···
status?: TranscriptionStatus;
progress?: number;
error_message?: string;
+
vttContent?: string;
},
) {
const updates: string[] = [];
···
updates.push("error_message = ?");
values.push(data.error_message);
}
+
updates.push("updated_at = ?");
values.push(Math.floor(Date.now() / 1000));
···
`${this.serviceUrl}/transcribe/${whisperJob.id}?format=vtt`,
);
if (vttResponse.ok) {
-
const vttContent = await vttResponse.text();
-
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
-
await saveTranscriptVTT(transcriptionId, cleanedVTT);
-
}
+
const vttContent = await vttResponse.text();
+
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+
await saveTranscriptVTT(transcriptionId, cleanedVTT);
+
this.updateTranscription(transcriptionId, {});
+
}
} catch (error) {
console.warn(
`[Sync] Failed to fetch VTT for ${transcriptionId}:`,
+44
src/lib/vtt-cleaner.test.ts
···
import { test, expect } from "bun:test";
import { cleanVTT } from "./vtt-cleaner";
+
import { readFileSync } from "fs";
+
import { join } from "path";
const sampleVTT = `WEBVTT
···
expect(result).toBe(emptyVTT);
});
+
+
test("cleanVTT detects multiple paragraphs", async () => {
+
const multiParaVTT = `WEBVTT
+
+
Paragraph 1-1
+
00:00:00.000 --> 00:00:00.000
+
Again, thank you for the privilege to not only study here, but also to teach here. Jesus,
+
+
Paragraph 1-2
+
00:00:00.000 --> 00:00:00.000
+
thank you. All`;
+
+
const result = await cleanVTT("test-multi-para", multiParaVTT);
+
+
expect(result).toContain("Paragraph 1-1");
+
expect(result).toContain("Paragraph 2-1");
+
// Should have at least two paragraphs
+
const paraMatches = result.match(/Paragraph \d+-\d+/g);
+
expect(paraMatches?.length).toBeGreaterThan(1);
+
}, 30000);
+
+
test("cleanVTT with real transcription data", async () => {
+
const originalApiKey = process.env.OPENROUTER_API_KEY;
+
// Temporarily unset to force fallback
+
delete process.env.OPENROUTER_API_KEY;
+
+
try {
+
const vttPath = join(__dirname, "../../transcripts/d69d8076-598a-4fe5-8100-fe3eff47fcd6.vtt");
+
const realVTT = readFileSync(vttPath, "utf-8");
+
+
const result = await cleanVTT("real-test", realVTT);
+
+
expect(result).toContain("WEBVTT");
+
// Check that it has multiple paragraph numbers
+
const paraMatches = result.match(/Paragraph (\d+)-\d+/g);
+
const uniqueParas = new Set(paraMatches?.map(m => m.match(/Paragraph (\d+)/)?.[1]));
+
expect(uniqueParas.size).toBeGreaterThan(1);
+
console.log("Paragraphs found:", uniqueParas.size);
+
} finally {
+
process.env.OPENROUTER_API_KEY = originalApiKey;
+
}
+
}, 30000);
+147 -29
src/lib/vtt-cleaner.ts
···
// Parse and clean VTT files
-
import { cleanTranscript } from "./transcript-cleaner";
+
import type { ParagraphBoundary } from "./transcript-cleaner";
interface VTTSegment {
index?: number;
timestamp: string;
text: string;
+
start?: number;
+
end?: number;
}
/**
-
* Parse VTT content into segments
+
* Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds
*/
-
function parseVTT(vttContent: string): VTTSegment[] {
+
function parseTimestampToSeconds(ts?: string): number {
+
if (!ts) return 0;
+
// ts expected like "00:00:09.039"
+
const parts = ts.split(":").map((p) => p.trim());
+
const hh = parts[0] ?? "0";
+
const mm = parts[1] ?? "0";
+
const ss = parts[2] ?? "0";
+
if (parts.length === 3) {
+
const seconds =
+
parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
+
return seconds;
+
} else if (parts.length === 2) {
+
return parseInt(mm, 10) * 60 + parseFloat(ss);
+
}
+
return 0;
+
}
+
+
/**
+
* Parse VTT content into segments, populating start/end in seconds
+
*/
+
export function parseVTT(vttContent: string): VTTSegment[] {
const lines = vttContent.split("\n");
const segments: VTTSegment[] = [];
let currentSegment: Partial<VTTSegment> = {};
···
if (!line) {
if (currentSegment.timestamp && currentSegment.text) {
+
// parse start/end
+
const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+
currentSegment.timestamp || "",
+
);
+
if (match) {
+
currentSegment.start = parseTimestampToSeconds(match[1]);
+
currentSegment.end = parseTimestampToSeconds(match[2]);
+
}
segments.push(currentSegment as VTTSegment);
currentSegment = {};
}
···
continue;
}
+
// Check if it's a cue id (before timestamp)
+
if (!currentSegment.timestamp && line && !line.includes("-->")) {
+
currentSegment.index = line;
+
continue;
+
}
+
// Check if it's a timestamp line
if (line.includes("-->")) {
currentSegment.timestamp = line;
// Next line(s) will be text
const textLines: string[] = [];
i++;
-
while (i < lines.length && lines[i]?.trim() && !lines[i]?.includes("-->")) {
+
while (
+
i < lines.length &&
+
lines[i]?.trim() &&
+
!lines[i]?.includes("-->")
+
) {
textLines.push(lines[i] || "");
i++;
}
···
// Add last segment if exists
if (currentSegment.timestamp && currentSegment.text) {
+
const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+
currentSegment.timestamp || "",
+
);
+
if (match?.[1] && match[2]) {
+
currentSegment.start = parseTimestampToSeconds(match[1]);
+
currentSegment.end = parseTimestampToSeconds(match[2]);
+
}
segments.push(currentSegment as VTTSegment);
}
···
}
/**
-
* Clean VTT text segments by removing tags and fixing grammar
+
* Clean VTT text segments by removing tags and fixing grammar.
+
* Additionally, merge cleaned segments into paragraph cues while preserving
+
* stable paragraph IDs (derived from first segment start time).
*/
export async function cleanVTT(
transcriptionId: string,
···
}
console.log(
-
`[VTTCleaner] Cleaning ${segments.length} segments for ${transcriptionId}`,
+
`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
);
-
// Combine all text for cleaning
+
// Combine all text for cleaning and paragraphing
const allText = segments.map((s) => s.text).join(" ");
-
const { cleanedTranscript, error } = await cleanTranscript({
-
transcriptId: transcriptionId,
-
rawTranscript: allText,
-
});
+
// Attempt LLM-driven cleaning and paragraphing in one request, fallback to deterministic rules
+
let paragraphBoundaries: ParagraphBoundary[] = [];
-
if (error) {
-
console.warn(`[VTTCleaner] Falling back to original VTT: ${error}`);
-
return vttContent;
+
try {
+
const { cleanAndGetParagraphBoundaries } = await import(
+
"./transcript-cleaner"
+
);
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: transcriptionId,
+
rawTranscript: allText,
+
segments: segments.map((s) => ({
+
index: s.index,
+
start: s.start,
+
end: s.end,
+
text: s.text,
+
})),
+
maxWordsMove: 0,
+
});
+
+
if (result?.paragraphs) {
+
paragraphBoundaries = result.paragraphs;
+
}
+
} catch (e) {
+
console.warn(
+
"[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
+
e,
+
);
}
-
// Split cleaned text back into segments
-
// Use simple word-based splitting proportional to original segment lengths
-
const words = cleanedTranscript.split(/\s+/);
-
const originalWords = allText.split(/\s+/);
-
const ratio = words.length / originalWords.length;
+
if (paragraphBoundaries.length === 0) {
+
// No paragraphs detected, treat as one big paragraph
+
paragraphBoundaries = [
+
{
+
startSegmentIndex: 0,
+
endSegmentIndex: segments.length - 1,
+
text: allText,
+
},
+
];
+
}
+
+
// Get the full cleaned transcript from paragraphs
+
const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
+
+
// Split cleaned text back into segments proportionally (word-based)
+
const words = cleanedTranscript.split(/\s+/).filter(Boolean);
+
const originalWords = allText.split(/\s+/).filter(Boolean);
+
const ratio = words.length / Math.max(1, originalWords.length);
let wordIndex = 0;
const cleanedSegments: VTTSegment[] = [];
for (const segment of segments) {
-
const originalWordCount = segment.text.split(/\s+/).length;
+
const originalWordCount = Math.max(
+
1,
+
segment.text.split(/\s+/).filter(Boolean).length,
+
);
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
wordIndex += newWordCount;
cleanedSegments.push({
+
index: segment.index,
timestamp: segment.timestamp,
text: segmentWords.join(" "),
-
index: segment.index,
+
start: segment.start,
+
end: segment.end,
});
}
-
// Rebuild VTT
-
let output = "WEBVTT\n\n";
-
for (const segment of cleanedSegments) {
-
if (segment.index !== undefined) {
-
output += `${segment.index}\n`;
+
// If any remaining words, append to last segment
+
if (wordIndex < words.length && cleanedSegments.length > 0) {
+
const rest = words.slice(wordIndex).join(" ");
+
const lastIdx = cleanedSegments.length - 1;
+
const lastSeg = cleanedSegments[lastIdx];
+
if (lastSeg) {
+
lastSeg.text += (lastSeg.text ? " " : "") + rest;
}
-
output += `${segment.timestamp}\n`;
-
output += `${segment.text}\n\n`;
}
-
console.log(`[VTTCleaner] Completed for ${transcriptionId}`);
+
// Assign paragraph-based IDs to segments
+
for (let i = 0; i < cleanedSegments.length; i++) {
+
const seg = cleanedSegments[i];
+
if (!seg) continue;
+
+
// Find which paragraph this segment belongs to
+
let paraIndex = 0;
+
let segmentInPara = 1;
+
for (let p = 0; p < paragraphBoundaries.length; p++) {
+
const para = paragraphBoundaries[p];
+
if (i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
+
paraIndex = p + 1;
+
segmentInPara = i - para.startSegmentIndex + 1;
+
break;
+
}
+
}
+
+
// Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
+
seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
+
}
+
+
// Build output VTT with cleaned segment cues having paragraph-based IDs
+
let output = "WEBVTT\n\n";
+
for (const seg of cleanedSegments) {
+
if (!seg || !seg.timestamp || !seg.text) continue;
+
output += `${seg.index}\n`;
+
output += `${seg.timestamp}\n`;
+
output += `${seg.text}\n\n`;
+
}
+
+
console.log(
+
`[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,
+
);
return output;
}