🪻 distributed transcription service thistle.dunkirk.sh

chore: ai-based cleaner

dunkirk.sh 2c25d3a2 a88b2813

verified
+72 -17
src/components/transcription.ts
···
import { css, html, LitElement } from "lit";
import { customElement, state } from "lit/decorators.js";
interface TranscriptionJob {
id: string;
···
created_at: number;
audioUrl?: string;
vttSegments?: VTTSegment[];
}
interface VTTSegment {
start: number;
end: number;
text: string;
}
···
let i = 0;
// Skip WEBVTT header
- while (i < lines.length && !lines[i]?.includes("-->")) {
i++;
}
while (i < lines.length) {
- const line = lines[i];
- if (line?.includes("-->")) {
- const [startStr, endStr] = line.split("-->").map((s) => s.trim());
const start = parseVTTTimestamp(startStr || "");
const end = parseVTTTimestamp(endStr || "");
···
start,
end,
text: textLines.join(" ").trim(),
});
}
- i++;
}
return segments;
···
border-radius: 2px;
}
.audio-player {
margin-top: 1rem;
width: 100%;
···
if (response.ok) {
const vttContent = await response.text();
const segments = parseVTT(vttContent);
-
- // Update job with VTT segments
const job = this.jobs.find((j) => j.id === jobId);
if (job) {
job.vttSegments = segments;
job.audioUrl = `/api/transcriptions/${jobId}/audio`;
this.jobs = [...this.jobs];
···
"audio/wav", // WAV
"audio/x-wav", // WAV (alternative)
"audio/m4a", // M4A
"audio/mp4", // MP4 audio
"audio/aac", // AAC
"audio/ogg", // OGG
···
}
private renderTranscript(job: TranscriptionJob) {
- if (!job.vttSegments) {
const displayed = this.displayedTranscripts.get(job.id) || "";
return displayed;
}
- const segments = job.vttSegments;
- // Render segments as clickable spans
- return html`${segments.map(
- (segment, idx) => html`<span
- class="segment"
- data-start="${segment.start}"
- data-end="${segment.end}"
- >${segment.text}</span>${idx < segments.length - 1 ? " " : ""}`,
- )}`;
}
···
import { css, html, LitElement } from "lit";
import { customElement, state } from "lit/decorators.js";
+ import { parseVTT } from "../lib/vtt-cleaner";
interface TranscriptionJob {
id: string;
···
created_at: number;
audioUrl?: string;
vttSegments?: VTTSegment[];
+ vttContent?: string;
}
interface VTTSegment {
start: number;
end: number;
text: string;
+ index?: string;
}
···
let i = 0;
// Skip WEBVTT header
+ while (i < lines.length && lines[i]?.trim() !== "WEBVTT") {
i++;
}
+ i++; // Skip WEBVTT
while (i < lines.length) {
+ let index: string | undefined;
+ // Check for cue ID (line before timestamp)
+ if (lines[i]?.trim() && !lines[i]?.includes("-->")) {
+ index = lines[i]?.trim();
+ i++;
+ }
+
+ if (i < lines.length && lines[i]?.includes("-->")) {
+ const [startStr, endStr] = (lines[i] || "").split("-->").map((s) => s.trim());
const start = parseVTTTimestamp(startStr || "");
const end = parseVTTTimestamp(endStr || "");
···
start,
end,
text: textLines.join(" ").trim(),
+ index,
});
+ } else {
+ i++;
}
}
return segments;
···
border-radius: 2px;
}
+ .paragraph {
+ display: block;
+ margin: 0 0 1rem 0;
+ line-height: 1.6;
+ }
+
.audio-player {
margin-top: 1rem;
width: 100%;
···
if (response.ok) {
const vttContent = await response.text();
const segments = parseVTT(vttContent);
+
+ // Update job with VTT content and segments
const job = this.jobs.find((j) => j.id === jobId);
if (job) {
+ job.vttContent = vttContent;
job.vttSegments = segments;
job.audioUrl = `/api/transcriptions/${jobId}/audio`;
this.jobs = [...this.jobs];
···
"audio/wav", // WAV
"audio/x-wav", // WAV (alternative)
"audio/m4a", // M4A
+ "audio/x-m4a", // M4A (alternative)
"audio/mp4", // MP4 audio
"audio/aac", // AAC
"audio/ogg", // OGG
···
}
private renderTranscript(job: TranscriptionJob) {
+ if (!job.vttContent) {
const displayed = this.displayedTranscripts.get(job.id) || "";
return displayed;
}
+ const segments = parseVTT(job.vttContent);
+ // Group segments by paragraph (extract paragraph number from ID like "Paragraph 1-1" -> "1")
+ const paragraphGroups = new Map<string, typeof segments>();
+ for (const segment of segments) {
+ const id = String(segment.index ?? "").trim();
+ const match = id.match(/^Paragraph\s+(\d+)-/);
+ const paraNum = match?.[1] ?? "0";
+ if (!paragraphGroups.has(paraNum)) {
+ paragraphGroups.set(paraNum, []);
+ }
+ paragraphGroups.get(paraNum)!.push(segment);
+ }
+
+ // Render each paragraph group
+ const paragraphs = Array.from(paragraphGroups.values()).map((groupSegments) => {
+ // Concatenate all text in the group
+ const fullText = groupSegments.map((s) => s.text || "").join(" ");
+ // Split into sentences
+ const sentences = fullText.split(/(?<=[.!?])\s+/g).filter(Boolean);
+ // Calculate word counts for timing
+ const wordCounts = sentences.map((s) => s.split(/\s+/).filter(Boolean).length);
+ const totalWords = Math.max(1, wordCounts.reduce((a, b) => a + b, 0));
+
+ // Overall paragraph timing
+ const paraStart = Math.min(...groupSegments.map((s) => s.start ?? 0));
+ const paraEnd = Math.max(...groupSegments.map((s) => s.end ?? paraStart));
+
+ let acc = 0;
+ const paraDuration = paraEnd - paraStart;
+
+ return html`<div class="paragraph">
+ ${sentences.map((sent, si) => {
+ const words = wordCounts[si] ?? 0;
+ const startOffset = (acc / totalWords) * paraDuration;
+ acc += words;
+ const sentenceDuration = (words / totalWords) * paraDuration;
+ const endOffset = si < sentences.length - 1 ? startOffset + sentenceDuration - 0.001 : paraDuration;
+ const spanStart = paraStart + startOffset;
+ const spanEnd = paraStart + endOffset;
+ return html`<span class="segment" data-start="${spanStart}" data-end="${spanEnd}">${sent}</span>${si < sentences.length - 1 ? " " : ""}`;
+ })}
+ </div>`;
+ });
+
+ return html`${paragraphs}`;
}
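
The span timings above come from linear interpolation: each paragraph's start..end window is divided among its re-joined sentences in proportion to word count, since sentence boundaries no longer line up with the original cue times. A standalone sketch of that math with invented values (not part of the component):

const sentences = ["First thought here.", "And a somewhat longer second thought follows."];
const counts = sentences.map((s) => s.split(/\s+/).filter(Boolean).length); // [3, 7]
const total = counts.reduce((a, b) => a + b, 0); // 10 words
const paraStart = 0, paraEnd = 10; // paragraph spans 0s..10s
let acc = 0;
for (let i = 0; i < sentences.length; i++) {
  const start = paraStart + (acc / total) * (paraEnd - paraStart);
  acc += counts[i] ?? 0;
  const end = paraStart + (acc / total) * (paraEnd - paraStart);
  console.log(sentences[i], start.toFixed(1), end.toFixed(1)); // "0.0 3.0", then "3.0 10.0"
}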
+78 -37
src/lib/transcript-cleaner.test.ts
···
import { test, expect } from "bun:test";
- import { cleanTranscript } from "./transcript-cleaner";
- test("cleanTranscript removes tags and fixes grammar", async () => {
- const rawTranscript = `[SIDE CONVERSATION] Yes? So with this course packet, what quiz is and exams, and if I can study through here, what you talk about? And I give you a good review every time. Yeah, so I'd be good to just study that and then we can do it. Yeah, and all the examples and stuff that we get from class especially. And then I, like your first quiz, I give you a mock quiz exactly like the quiz. Oh, okay. so you can kind of get a feel for how I do things. [inaudible] Okay? [inaudible] Yeah. [background chatter]`;
- const result = await cleanTranscript({
- transcriptId: "test-123",
- rawTranscript,
- });
- // Check that tags are removed
- expect(result.cleanedTranscript).not.toContain("[SIDE CONVERSATION]");
- expect(result.cleanedTranscript).not.toContain("[inaudible]");
- expect(result.cleanedTranscript).not.toContain("[background chatter]");
- // Check that we got some text back
- expect(result.cleanedTranscript.length).toBeGreaterThan(0);
- expect(result.cleanedTranscript.length).toBeLessThan(rawTranscript.length);
- console.log("Original:", rawTranscript.substring(0, 100));
- console.log("Cleaned:", result.cleanedTranscript.substring(0, 100));
}, 30000); // 30s timeout for API call
- test("cleanTranscript handles empty transcript", async () => {
- const result = await cleanTranscript({
- transcriptId: "test-empty",
- rawTranscript: "",
- });
- expect(result.cleanedTranscript).toBe("");
});
- test("cleanTranscript falls back to raw transcript on API error", async () => {
- const rawTranscript = "Test transcript";
- // Test with missing API key (if it's actually set, this test might fail)
- const originalKey = process.env.GEMINI_API_KEY;
- delete process.env.GEMINI_API_KEY;
- const result = await cleanTranscript({
- transcriptId: "test-fallback",
- rawTranscript,
- });
- expect(result.cleanedTranscript).toBe(rawTranscript);
- expect(result.error).toBe("GEMINI_API_KEY not set");
- // Restore key
- if (originalKey) {
- process.env.GEMINI_API_KEY = originalKey;
- }
});
···
import { test, expect } from "bun:test";
+ import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";
+
+ test("cleanAndGetParagraphBoundaries cleans transcript and returns paragraph boundaries", async () => {
+ // Use a longer, more realistic transcript sample with natural paragraph breaks
+ const rawTranscript = `[SIDE CONVERSATION] Today in chapel we are talking about the fact that we believe in having gospel conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's gonna be a little more conversational than normal. It's not gonna be like one of the normal sermons, although I know me and my tendency it'll turn into a sermon at some point just because that's the way God made me, so I can't help it.
+
+ Alright, so when it starts just have fun with it. We'll go on. Here's what it says in our doctrinal statement. It says, "Due to the commission of Christ and the urgency of the Gospel, all believers are to engage in Gospel conversations." How many of you believe that? That's pretty weak. How many of you believe that?
+ To live God-honoring lives and to work continuously for the spread of the Gospel to their neighbors and the nations. Now, let's be honest, as we start off this morning, all of us could do a better job with personal evangelism, and all of us could do a better job with a heart for missions.
+ So I'm not up here talking to you about something I have conquered or mastered. I'm not the expert on this. In fact, when it comes to personal evangelism in my own strength, I'm often a complete failure. But I have found that even in my weakness, God can use me in powerful ways when I make myself available to Him.`;
+ // Create mock segments from raw transcript (simulating whisper output)
+ const sentences = rawTranscript.split(/\.\s+/);
+ const mockSegments: { index?: number; start?: number; end?: number; text: string }[] = [];
+ let timeOffset = 0;
+ for (let i = 0; i < sentences.length; i++) {
+ const sentence = sentences[i]?.trim();
+ if (!sentence) continue;
+ const duration = sentence.split(/\s+/).length * 0.3; // ~0.3s per word
+ mockSegments.push({
+ index: i,
+ start: timeOffset,
+ end: timeOffset + duration,
+ text: sentence,
+ });
+ timeOffset += duration;
+ }
+
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: "test-123",
+ rawTranscript,
+ segments: mockSegments,
+ maxWordsMove: 3,
+ });
+
+ // Check that we got a result
+ expect(result.paragraphs).toBeDefined();
+ expect(result.paragraphs!.length).toBeGreaterThan(1); // Should have multiple paragraphs
+
+ // Check that paragraphs have the expected structure
+ for (const para of result.paragraphs!) {
+ expect(para).toHaveProperty("startSegmentIndex");
+ expect(para).toHaveProperty("endSegmentIndex");
+ expect(para).toHaveProperty("text");
+ expect(para.text.length).toBeGreaterThan(0);
+ }
+
+ // The cleaned text should have tags removed
+ const cleanedText = result.paragraphs!.map((p) => p.text).join(" ");
+ expect(cleanedText).not.toContain("[SIDE CONVERSATION]");
+ expect(cleanedText.toLowerCase()).toContain("gospel");
+ expect(cleanedText.toLowerCase()).toContain("evangelism");
+ console.log(`Detected ${result.paragraphs!.length} paragraphs from ${mockSegments.length} segments`);
+ console.log("First paragraph:", result.paragraphs![0]?.text.substring(0, 100) + "...");
+ console.log("Last paragraph:", result.paragraphs![result.paragraphs!.length - 1]?.text.substring(0, 100) + "...");
}, 30000); // 30s timeout for API call
+ test("cleanAndGetParagraphBoundaries handles empty transcript", async () => {
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: "test-empty",
+ rawTranscript: "",
+ segments: [],
+ maxWordsMove: 3,
+ });
+ expect(result.paragraphs).toEqual([]);
});
+ test("cleanAndGetParagraphBoundaries returns error on missing API key", async () => {
+ const rawTranscript = "Test transcript";
+ // Unset the API key to force the error path; it is restored below
+ const originalKey = process.env.OPENROUTER_API_KEY;
+ delete process.env.OPENROUTER_API_KEY;
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: "test-fallback",
+ rawTranscript,
+ segments: [{ text: rawTranscript }],
+ maxWordsMove: 3,
+ });
+ expect(result.paragraphs).toBeUndefined();
+ expect(result.error).toBe("OPENROUTER_API_KEY not set");
+ // Restore key
+ if (originalKey) {
+ process.env.OPENROUTER_API_KEY = originalKey;
+ }
});
+91 -84
src/lib/transcript-cleaner.ts
···
- // Clean up transcripts using Gemini to remove tags and fix grammar
-
- interface CleanTranscriptOptions {
- transcriptId: string;
- rawTranscript: string;
- }
-
- interface CleanTranscriptResult {
- cleanedTranscript: string;
- error?: string;
}
- /**
- * Clean transcript using Gemini Flash 2.0 (cheapest model)
- * Removes tags like [SIDE CONVERSATION], [inaudible], etc.
- * Fixes grammar while preserving sentence structure
- */
- export async function cleanTranscript({
transcriptId,
rawTranscript,
- }: CleanTranscriptOptions): Promise<CleanTranscriptResult> {
- const apiKey = process.env.GEMINI_API_KEY;
if (!apiKey) {
- return {
- cleanedTranscript: rawTranscript,
- error: "GEMINI_API_KEY not set",
- };
}
- // Skip cleaning if transcript is empty
- if (!rawTranscript || rawTranscript.trim().length === 0) {
- return {
- cleanedTranscript: rawTranscript,
- };
- }
- console.log(
- `[TranscriptCleaner] Starting cleanup for ${transcriptId} (${rawTranscript.length} chars)`,
- );
- try {
- const prompt = `You are a transcript editor. Clean up this transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
···
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")
- Return ONLY the cleaned transcript text, nothing else.
- Transcript to clean:
${rawTranscript}`;
const response = await fetch(
- "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent",
{
method: "POST",
headers: {
"Content-Type": "application/json",
- "x-goog-api-key": apiKey,
},
body: JSON.stringify({
- contents: [
- {
- parts: [{ text: prompt }],
- },
],
- generationConfig: {
- temperature: 0.3,
- topK: 40,
- topP: 0.95,
- maxOutputTokens: 8192,
- },
}),
},
);
if (!response.ok) {
const errorText = await response.text();
- console.error(
- `[TranscriptCleaner] Gemini API error for ${transcriptId}:`,
- errorText,
- );
- return {
- cleanedTranscript: rawTranscript,
- error: `Gemini API error: ${response.status}`,
- };
}
const result = await response.json();
- const cleanedText =
- result.candidates?.[0]?.content?.parts?.[0]?.text?.trim();
- if (!cleanedText) {
- console.warn(
- `[TranscriptCleaner] Empty response from Gemini for ${transcriptId}`,
- );
- return {
- cleanedTranscript: rawTranscript,
- error: "Empty response from Gemini",
- };
}
- const reduction = Math.round(
- ((rawTranscript.length - cleanedText.length) / rawTranscript.length) *
- 100,
- );
- console.log(
- `[TranscriptCleaner] Completed for ${transcriptId}: ${rawTranscript.length} → ${cleanedText.length} chars (${reduction}% reduction)`,
- );
- return {
- cleanedTranscript: cleanedText,
- };
- } catch (error) {
- console.error(
- `[TranscriptCleaner] Failed to clean ${transcriptId}:`,
- error,
- );
- return {
- cleanedTranscript: rawTranscript,
- error: error instanceof Error ? error.message : "Unknown error",
- };
}
}
···
+ // Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.
+ export interface ParagraphBoundary {
+ startSegmentIndex: number;
+ endSegmentIndex: number;
+ text: string;
+ // Optional: list of moved words for auditing
+ movedWords?: { word: string; fromSegmentIndex: number; toSegmentIndex: number }[];
}
+ // Cleans the transcript and determines paragraph boundaries in one LLM request.
+ // Returns paragraph boundaries as a JSON array.
+ export async function cleanAndGetParagraphBoundaries({
transcriptId,
rawTranscript,
+ segments,
+ maxWordsMove = 0,
+ }: {
+ transcriptId: string;
+ rawTranscript: string;
+ segments: { index?: number | string; start?: number; end?: number; text: string }[];
+ maxWordsMove?: number;
+ }): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
+ // Skip processing if transcript is empty
+ if (!rawTranscript || rawTranscript.trim().length === 0) {
+ return { paragraphs: [] };
+ }
+ const apiKey = process.env.OPENROUTER_API_KEY;
+ const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
if (!apiKey) {
+ return { error: "OPENROUTER_API_KEY not set" };
}
+ try {
+ const segmentsPayload = segments.map((s) => ({
+ index: s.index ?? null,
+ start: s.start ?? null,
+ end: s.end ?? null,
+ text: s.text ?? "",
+ }));
+ const prompt = `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.
+ Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
···
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")
+ Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
+ - Natural topic changes or shifts in the speaker's focus
+ - Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
+ - Complete narrative beats or examples
+ - Typical spoken paragraph length (30-120 seconds / 5-20 segments)
+
+ CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.
+
+ RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):
+
+ [ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]
+
+ Rules for paragraphing:
+ - ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
+ - NEVER break a paragraph in the middle of a sentence
+ - Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
+ - DO NOT put the entire transcript in a single paragraph
+ - Paragraphs must reference original segment indexes
+ - Do not move words across segment boundaries
+ - Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission
+
+ Segments:
+ ${JSON.stringify(segmentsPayload, null, 2)}
+ Raw Transcript:
${rawTranscript}`;
const response = await fetch(
+ "https://openrouter.ai/api/v1/chat/completions",
{
method: "POST",
headers: {
"Content-Type": "application/json",
+ "Authorization": `Bearer ${apiKey}`,
+ "HTTP-Referer": "https://thistle.app",
+ "X-Title": "Thistle Transcription",
},
body: JSON.stringify({
+ model,
+ messages: [
+ { role: "user", content: prompt },
],
+ temperature: 0.0,
+ max_tokens: 8192,
}),
},
);
if (!response.ok) {
const errorText = await response.text();
+ console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
+ return { error: `OpenRouter API error: ${response.status}` };
}
const result = await response.json();
+ const raw = result.choices?.[0]?.message?.content?.trim();
+ if (!raw) {
+ return { error: "Empty paragrapher response" };
+ }
+ let parsed: ParagraphBoundary[] | null = null;
+ try {
+ parsed = JSON.parse(raw) as ParagraphBoundary[];
+ } catch {
+ // Attempt to extract a JSON substring if the model padded the array with extra text
+ const firstBracket = raw.indexOf("[");
+ const lastBracket = raw.lastIndexOf("]");
+ if (firstBracket >= 0 && lastBracket > firstBracket) {
+ const substr = raw.substring(firstBracket, lastBracket + 1);
+ parsed = JSON.parse(substr) as ParagraphBoundary[];
+ }
}
+ if (!parsed || !Array.isArray(parsed)) {
+ return { error: "Failed to parse paragrapher JSON" };
+ }
+ return { paragraphs: parsed };
+ } catch (err) {
+ console.error("[Paragrapher] Exception:", err);
+ return { error: err instanceof Error ? err.message : "Unknown error" };
}
}
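
A hypothetical call site for the new helper, with invented segment values (the field shapes follow the signature above; none of this is from the repo):

import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";

const { paragraphs, error } = await cleanAndGetParagraphBoundaries({
  transcriptId: "demo",
  rawTranscript: "Hello everyone. [inaudible] Today we cover VTT cleanup.",
  segments: [
    { index: 0, start: 0, end: 2.5, text: "Hello everyone. [inaudible]" },
    { index: 1, start: 2.5, end: 5.0, text: "Today we cover VTT cleanup." },
  ],
  maxWordsMove: 0, // accepted by the signature, but unused in the body shown in this diff
});
if (error) console.warn(error);
else console.log(paragraphs); // e.g. [{ startSegmentIndex: 0, endSegmentIndex: 1, text: "..." }]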
+13 -8
src/lib/transcription.ts
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
`${this.serviceUrl}/transcribe/${whisperJobId}?format=vtt`,
);
if (vttResponse.ok) {
- const vttContent = await vttResponse.text();
- const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
- await saveTranscriptVTT(transcriptionId, cleanedVTT);
- }
} catch (error) {
console.warn(
`[Transcription] Failed to fetch VTT for ${transcriptionId}:`,
···
status?: TranscriptionStatus;
progress?: number;
error_message?: string;
},
) {
const updates: string[] = [];
···
updates.push("error_message = ?");
values.push(data.error_message);
}
updates.push("updated_at = ?");
values.push(Math.floor(Date.now() / 1000));
···
`${this.serviceUrl}/transcribe/${whisperJob.id}?format=vtt`,
);
if (vttResponse.ok) {
- const vttContent = await vttResponse.text();
- const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
- await saveTranscriptVTT(transcriptionId, cleanedVTT);
- }
} catch (error) {
console.warn(
`[Sync] Failed to fetch VTT for ${transcriptionId}:`,
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
+ import { parseVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
`${this.serviceUrl}/transcribe/${whisperJobId}?format=vtt`,
);
if (vttResponse.ok) {
+ const vttContent = await vttResponse.text();
+ const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+ await saveTranscriptVTT(transcriptionId, cleanedVTT);
+ this.updateTranscription(transcriptionId, {});
+ }
} catch (error) {
console.warn(
`[Transcription] Failed to fetch VTT for ${transcriptionId}:`,
···
status?: TranscriptionStatus;
progress?: number;
error_message?: string;
+ vttContent?: string;
},
) {
const updates: string[] = [];
···
updates.push("error_message = ?");
values.push(data.error_message);
}
+
updates.push("updated_at = ?");
values.push(Math.floor(Date.now() / 1000));
···
`${this.serviceUrl}/transcribe/${whisperJob.id}?format=vtt`,
);
if (vttResponse.ok) {
+ const vttContent = await vttResponse.text();
+ const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+ await saveTranscriptVTT(transcriptionId, cleanedVTT);
+ this.updateTranscription(transcriptionId, {});
+ }
} catch (error) {
console.warn(
`[Sync] Failed to fetch VTT for ${transcriptionId}:`,
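
The empty-object updateTranscription calls above work because the method unconditionally appends updated_at before writing, so they reduce to a timestamp bump. Sketched effect, assuming a transcriptions table (the table name and WHERE clause are not shown in this diff):

// updateTranscription(id, {}) effectively runs:
// UPDATE transcriptions SET updated_at = ? WHERE id = ?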
+44
src/lib/vtt-cleaner.test.ts
···
import { test, expect } from "bun:test";
import { cleanVTT } from "./vtt-cleaner";
const sampleVTT = `WEBVTT
···
expect(result).toBe(emptyVTT);
});
···
import { test, expect } from "bun:test";
import { cleanVTT } from "./vtt-cleaner";
+ import { readFileSync } from "fs";
+ import { join } from "path";
const sampleVTT = `WEBVTT
···
expect(result).toBe(emptyVTT);
});
+
+ test("cleanVTT detects multiple paragraphs", async () => {
+ const multiParaVTT = `WEBVTT
+
+ Paragraph 1-1
+ 00:00:00.000 --> 00:00:00.000
+ Again, thank you for the privilege to not only study here, but also to teach here. Jesus,
+
+ Paragraph 1-2
+ 00:00:00.000 --> 00:00:00.000
+ thank you. All`;
+
+ const result = await cleanVTT("test-multi-para", multiParaVTT);
+
+ expect(result).toContain("Paragraph 1-1");
+ expect(result).toContain("Paragraph 2-1");
+ // Should have at least two paragraphs
+ const paraMatches = result.match(/Paragraph \d+-\d+/g);
+ expect(paraMatches?.length).toBeGreaterThan(1);
+ }, 30000);
+
+ test("cleanVTT with real transcription data", async () => {
+ const originalApiKey = process.env.OPENROUTER_API_KEY;
+ // Temporarily unset to force the non-LLM fallback
+ delete process.env.OPENROUTER_API_KEY;
+
+ try {
+ const vttPath = join(__dirname, "../../transcripts/d69d8076-598a-4fe5-8100-fe3eff47fcd6.vtt");
+ const realVTT = readFileSync(vttPath, "utf-8");
+
+ const result = await cleanVTT("real-test", realVTT);
+
+ expect(result).toContain("WEBVTT");
+ // Check that it has multiple paragraph numbers
+ const paraMatches = result.match(/Paragraph (\d+)-\d+/g);
+ const uniqueParas = new Set(paraMatches?.map(m => m.match(/Paragraph (\d+)/)?.[1]));
+ expect(uniqueParas.size).toBeGreaterThan(1);
+ console.log("Paragraphs found:", uniqueParas.size);
+ } finally {
+ if (originalApiKey) {
+ process.env.OPENROUTER_API_KEY = originalApiKey;
+ }
+ }
+ }, 30000);
+147 -29
src/lib/vtt-cleaner.ts
···
// Parse and clean VTT files
- import { cleanTranscript } from "./transcript-cleaner";
interface VTTSegment {
index?: number;
timestamp: string;
text: string;
}
/**
- * Parse VTT content into segments
*/
- function parseVTT(vttContent: string): VTTSegment[] {
const lines = vttContent.split("\n");
const segments: VTTSegment[] = [];
let currentSegment: Partial<VTTSegment> = {};
···
if (!line) {
if (currentSegment.timestamp && currentSegment.text) {
segments.push(currentSegment as VTTSegment);
currentSegment = {};
}
···
continue;
}
// Check if it's a timestamp line
if (line.includes("-->")) {
currentSegment.timestamp = line;
// Next line(s) will be text
const textLines: string[] = [];
i++;
- while (i < lines.length && lines[i]?.trim() && !lines[i]?.includes("-->")) {
textLines.push(lines[i] || "");
i++;
}
···
// Add last segment if exists
if (currentSegment.timestamp && currentSegment.text) {
segments.push(currentSegment as VTTSegment);
}
···
}
/**
- * Clean VTT text segments by removing tags and fixing grammar
*/
export async function cleanVTT(
transcriptionId: string,
···
}
console.log(
- `[VTTCleaner] Cleaning ${segments.length} segments for ${transcriptionId}`,
);
- // Combine all text for cleaning
const allText = segments.map((s) => s.text).join(" ");
- const { cleanedTranscript, error } = await cleanTranscript({
- transcriptId: transcriptionId,
- rawTranscript: allText,
- });
- if (error) {
- console.warn(`[VTTCleaner] Falling back to original VTT: ${error}`);
- return vttContent;
}
- // Split cleaned text back into segments
- // Use simple word-based splitting proportional to original segment lengths
- const words = cleanedTranscript.split(/\s+/);
- const originalWords = allText.split(/\s+/);
- const ratio = words.length / originalWords.length;
let wordIndex = 0;
const cleanedSegments: VTTSegment[] = [];
for (const segment of segments) {
- const originalWordCount = segment.text.split(/\s+/).length;
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
wordIndex += newWordCount;
cleanedSegments.push({
timestamp: segment.timestamp,
text: segmentWords.join(" "),
- index: segment.index,
});
}
- // Rebuild VTT
- let output = "WEBVTT\n\n";
- for (const segment of cleanedSegments) {
- if (segment.index !== undefined) {
- output += `${segment.index}\n`;
}
- output += `${segment.timestamp}\n`;
- output += `${segment.text}\n\n`;
}
- console.log(`[VTTCleaner] Completed for ${transcriptionId}`);
return output;
}
···
// Parse and clean VTT files
+ import type { ParagraphBoundary } from "./transcript-cleaner";
interface VTTSegment {
+ index?: number | string;
timestamp: string;
text: string;
+ start?: number;
+ end?: number;
}
/**
+ * Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds
*/
+ function parseTimestampToSeconds(ts?: string): number {
+ if (!ts) return 0;
+ // ts expected like "00:00:09.039"
+ const parts = ts.split(":").map((p) => p.trim());
+ const hh = parts[0] ?? "0";
+ const mm = parts[1] ?? "0";
+ const ss = parts[2] ?? "0";
+ if (parts.length === 3) {
+ const seconds =
+ parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
+ return seconds;
+ } else if (parts.length === 2) {
+ // Two-part timestamps are mm:ss.mmm, so parts[0] is minutes and parts[1] is seconds
+ return parseInt(hh, 10) * 60 + parseFloat(mm);
+ }
+ return 0;
+ }
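// Illustrative values for the parser above (not part of the diff):
// parseTimestampToSeconds("00:01:09.039") -> 69.039
// parseTimestampToSeconds("01:09.039") -> 69.039
// parseTimestampToSeconds("") -> 0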
+
+ /**
+ * Parse VTT content into segments, populating start/end in seconds
+ */
+ export function parseVTT(vttContent: string): VTTSegment[] {
const lines = vttContent.split("\n");
const segments: VTTSegment[] = [];
let currentSegment: Partial<VTTSegment> = {};
···
if (!line) {
if (currentSegment.timestamp && currentSegment.text) {
+ // parse start/end
+ const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+ currentSegment.timestamp || "",
+ );
+ if (match) {
+ currentSegment.start = parseTimestampToSeconds(match[1]);
+ currentSegment.end = parseTimestampToSeconds(match[2]);
+ }
segments.push(currentSegment as VTTSegment);
currentSegment = {};
}
···
continue;
}
+ // Check if it's a cue id (before timestamp)
+ if (!currentSegment.timestamp && line && !line.includes("-->")) {
+ currentSegment.index = line;
+ continue;
+ }
+
// Check if it's a timestamp line
if (line.includes("-->")) {
currentSegment.timestamp = line;
// Next line(s) will be text
const textLines: string[] = [];
i++;
+ while (
+ i < lines.length &&
+ lines[i]?.trim() &&
+ !lines[i]?.includes("-->")
+ ) {
textLines.push(lines[i] || "");
i++;
}
···
// Add last segment if exists
if (currentSegment.timestamp && currentSegment.text) {
+ const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+ currentSegment.timestamp || "",
+ );
+ if (match?.[1] && match[2]) {
+ currentSegment.start = parseTimestampToSeconds(match[1]);
+ currentSegment.end = parseTimestampToSeconds(match[2]);
+ }
segments.push(currentSegment as VTTSegment);
}
···
}
/**
+ * Clean VTT text segments by removing tags and fixing grammar.
+ * Additionally, group cleaned segments into paragraphs and assign
+ * paragraph-based cue IDs of the form "Paragraph N-M".
*/
export async function cleanVTT(
transcriptionId: string,
···
}
console.log(
+ `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
);
+ // Combine all text for cleaning and paragraphing
const allText = segments.map((s) => s.text).join(" ");
+ // Attempt LLM-driven cleaning and paragraphing in one request; fall back to a single catch-all paragraph below
+ let paragraphBoundaries: ParagraphBoundary[] = [];
+ try {
+ const { cleanAndGetParagraphBoundaries } = await import(
+ "./transcript-cleaner"
+ );
+ const result = await cleanAndGetParagraphBoundaries({
+ transcriptId: transcriptionId,
+ rawTranscript: allText,
+ segments: segments.map((s) => ({
+ index: s.index,
+ start: s.start,
+ end: s.end,
+ text: s.text,
+ })),
+ maxWordsMove: 0,
+ });
+
+ if (result?.paragraphs) {
+ paragraphBoundaries = result.paragraphs;
+ }
+ } catch (e) {
+ console.warn(
+ "[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
+ e,
+ );
}
+ if (paragraphBoundaries.length === 0) {
+ // No paragraphs detected, treat as one big paragraph
+ paragraphBoundaries = [
+ {
+ startSegmentIndex: 0,
+ endSegmentIndex: segments.length - 1,
+ text: allText,
+ },
+ ];
+ }
+
+ // Get the full cleaned transcript from paragraphs
+ const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
+
+ // Split cleaned text back into segments proportionally (word-based)
+ const words = cleanedTranscript.split(/\s+/).filter(Boolean);
+ const originalWords = allText.split(/\s+/).filter(Boolean);
+ const ratio = words.length / Math.max(1, originalWords.length);
let wordIndex = 0;
const cleanedSegments: VTTSegment[] = [];
for (const segment of segments) {
+ const originalWordCount = Math.max(
+ 1,
+ segment.text.split(/\s+/).filter(Boolean).length,
+ );
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
wordIndex += newWordCount;
cleanedSegments.push({
+ index: segment.index,
timestamp: segment.timestamp,
text: segmentWords.join(" "),
+ start: segment.start,
+ end: segment.end,
});
}
+ // If any remaining words, append to last segment
+ if (wordIndex < words.length && cleanedSegments.length > 0) {
+ const rest = words.slice(wordIndex).join(" ");
+ const lastIdx = cleanedSegments.length - 1;
+ const lastSeg = cleanedSegments[lastIdx];
+ if (lastSeg) {
+ lastSeg.text += (lastSeg.text ? " " : "") + rest;
}
}
+ // Assign paragraph-based IDs to segments
+ for (let i = 0; i < cleanedSegments.length; i++) {
+ const seg = cleanedSegments[i];
+ if (!seg) continue;
+
+ // Find which paragraph this segment belongs to
+ let paraIndex = 0;
+ let segmentInPara = 1;
+ for (let p = 0; p < paragraphBoundaries.length; p++) {
+ const para = paragraphBoundaries[p];
+ if (para && i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
+ paraIndex = p + 1;
+ segmentInPara = i - para.startSegmentIndex + 1;
+ break;
+ }
+ }
+
+ // Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
+ seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
+ }
+
+ // Build output VTT with cleaned segment cues having paragraph-based IDs
+ let output = "WEBVTT\n\n";
+ for (const seg of cleanedSegments) {
+ if (!seg || !seg.timestamp || !seg.text) continue;
+ output += `${seg.index}\n`;
+ output += `${seg.timestamp}\n`;
+ output += `${seg.text}\n\n`;
+ }
+
+ console.log(
+ `[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,
+ );
return output;
}
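
Given the ID scheme above, the VTT emitted by cleanVTT carries one cue per cleaned segment, numbered "Paragraph N-M" as in the tests; an illustrative output with invented times and text:

WEBVTT

Paragraph 1-1
00:00:00.000 --> 00:00:04.200
Thank you for the privilege to not only study here, but also to teach here.

Paragraph 2-1
00:00:04.200 --> 00:00:08.000
Now, let's turn to today's topic.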