🪻 distributed transcription service thistle.dunkirk.sh

chore: AI-based transcript cleaner with paragraph-boundary detection

dunkirk.sh 2c25d3a2 a88b2813

verified
+72 -17
src/components/transcription.ts
···
import { css, html, LitElement } from "lit";
import { customElement, state } from "lit/decorators.js";
+
import { parseVTT } from "../lib/vtt-cleaner";
interface TranscriptionJob {
id: string;
···
created_at: number;
audioUrl?: string;
vttSegments?: VTTSegment[];
+
vttContent?: string;
}
interface VTTSegment {
start: number;
end: number;
text: string;
+
index?: string;
}
···
let i = 0;
// Skip WEBVTT header
-
while (i < lines.length && !lines[i]?.includes("-->")) {
+
while (i < lines.length && lines[i]?.trim() !== "WEBVTT") {
i++;
}
+
i++; // Skip WEBVTT
while (i < lines.length) {
-
const line = lines[i];
-
if (line?.includes("-->")) {
-
const [startStr, endStr] = line.split("-->").map((s) => s.trim());
+
let index: string | undefined;
+
// Check for cue ID (line before timestamp)
+
if (lines[i]?.trim() && !lines[i]?.includes("-->")) {
+
index = lines[i]?.trim();
+
i++;
+
}
+
+
if (i < lines.length && lines[i]?.includes("-->")) {
+
const [startStr, endStr] = lines[i].split("-->").map((s) => s.trim());
const start = parseVTTTimestamp(startStr || "");
const end = parseVTTTimestamp(endStr || "");
···
start,
end,
text: textLines.join(" ").trim(),
+
index,
});
+
} else {
+
i++;
}
-
i++;
}
return segments;
···
border-radius: 2px;
}
+
.paragraph {
+
display: block;
+
margin: 0 0 1rem 0;
+
line-height: 1.6;
+
}
+
.audio-player {
margin-top: 1rem;
width: 100%;
···
if (response.ok) {
const vttContent = await response.text();
const segments = parseVTT(vttContent);
-
-
// Update job with VTT segments
+
+
// Update job with VTT content and segments
const job = this.jobs.find((j) => j.id === jobId);
if (job) {
+
job.vttContent = vttContent;
job.vttSegments = segments;
job.audioUrl = `/api/transcriptions/${jobId}/audio`;
this.jobs = [...this.jobs];
···
"audio/wav", // WAV
"audio/x-wav", // WAV (alternative)
"audio/m4a", // M4A
+
"audio/x-m4a", // M4A (alternative)
"audio/mp4", // MP4 audio
"audio/aac", // AAC
"audio/ogg", // OGG
···
}
private renderTranscript(job: TranscriptionJob) {
-
if (!job.vttSegments) {
+
if (!job.vttContent) {
const displayed = this.displayedTranscripts.get(job.id) || "";
return displayed;
}
-
const segments = job.vttSegments;
-
// Render segments as clickable spans
-
return html`${segments.map(
-
(segment, idx) => html`<span
-
class="segment"
-
data-start="${segment.start}"
-
data-end="${segment.end}"
-
>${segment.text}</span>${idx < segments.length - 1 ? " " : ""}`,
-
)}`;
+
const segments = parseVTT(job.vttContent);
+
// Group segments by paragraph (extract paragraph number from ID like "Paragraph 1-1" -> "1")
+
const paragraphGroups = new Map<string, typeof segments>();
+
for (const segment of segments) {
+
const id = (segment.index || '').trim();
+
const match = id.match(/^Paragraph\s+(\d+)-/);
+
const paraNum = match ? match[1] : '0';
+
if (!paragraphGroups.has(paraNum)) {
+
paragraphGroups.set(paraNum, []);
+
}
+
paragraphGroups.get(paraNum)!.push(segment);
+
}
+
+
// Render each paragraph group
+
const paragraphs = Array.from(paragraphGroups.entries()).map(([paraNum, groupSegments]) => {
+
// Concatenate all text in the group
+
const fullText = groupSegments.map(s => s.text || '').join(' ');
+
// Split into sentences
+
const sentences = fullText.split(/(?<=[\.\!\?])\s+/g).filter(Boolean);
+
// Calculate word counts for timing
+
const wordCounts = sentences.map((s) => s.split(/\s+/).filter(Boolean).length);
+
const totalWords = Math.max(1, wordCounts.reduce((a, b) => a + b, 0));
+
+
// Overall paragraph timing
+
const paraStart = Math.min(...groupSegments.map(s => s.start ?? 0));
+
const paraEnd = Math.max(...groupSegments.map(s => s.end ?? paraStart));
+
+
let acc = 0;
+
const paraDuration = paraEnd - paraStart;
+
+
return html`<div class="paragraph">
+
${sentences.map((sent, si) => {
+
const startOffset = (acc / totalWords) * paraDuration;
+
acc += wordCounts[si];
+
const sentenceDuration = (wordCounts[si] / totalWords) * paraDuration;
+
const endOffset = si < sentences.length - 1 ? startOffset + sentenceDuration - 0.001 : paraEnd - paraStart;
+
const spanStart = paraStart + startOffset;
+
const spanEnd = paraStart + endOffset;
+
return html`<span class="segment" data-start="${spanStart}" data-end="${spanEnd}">${sent}</span>${si < sentences.length - 1 ? ' ' : ''}`;
+
})}
+
</div>`;
+
});
+
+
return html`${paragraphs}`;
}
+78 -37
src/lib/transcript-cleaner.test.ts
···
import { test, expect } from "bun:test";
-
import { cleanTranscript } from "./transcript-cleaner";
+
import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";
+
+
test("cleanAndGetParagraphBoundaries cleans transcript and returns paragraph boundaries", async () => {
+
// Use a longer, more realistic transcript sample with natural paragraph breaks
+
const rawTranscript = `[SIDE CONVERSATION] Today in chapel we are talking about the fact that we believe in having gospel conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's gonna be a little more conversational than normal. It's not gonna be like one of the normal sermons, although I know me and my tendency it'll turn into a sermon at some point just because that's the way God made me, so I can't help it.
+
+
Alright, so when it starts just have fun with it. We'll go on. Here's what it says in our doctrinal statement. It says, "Due to the commission of Christ and the urgency of the Gospel, all believers are to engage in Gospel conversations." How many of you believe that? That's pretty weak. How many of you believe that?
-
test("cleanTranscript removes tags and fixes grammar", async () => {
-
const rawTranscript = `[SIDE CONVERSATION] Yes? So with this course packet, what quiz is and exams, and if I can study through here, what you talk about? And I give you a good review every time. Yeah, so I'd be good to just study that and then we can do it. Yeah, and all the examples and stuff that we get from class especially. And then I, like your first quiz, I give you a mock quiz exactly like the quiz. Oh, okay. so you can kind of get a feel for how I do things. [inaudible] Okay? [inaudible] Yeah. [background chatter]`;
+
To live God-honoring lives and to work continuously for the spread of the Gospel to their neighbors and the nations. Now, let's be honest, as we start off this morning, all of us could do a better job with personal evangelism, and all of us could do a better job with a heart for missions.
-
const result = await cleanTranscript({
-
transcriptId: "test-123",
-
rawTranscript,
-
});
+
So I'm not up here talking to you about something I have conquered or mastered. I'm not the expert on this. In fact, when it comes to personal evangelism in my own strength, I'm often a complete failure. But I have found that even in my weakness, God can use me in powerful ways when I make myself available to Him.`;
-
// Check that tags are removed
-
expect(result.cleanedTranscript).not.toContain("[SIDE CONVERSATION]");
-
expect(result.cleanedTranscript).not.toContain("[inaudible]");
-
expect(result.cleanedTranscript).not.toContain("[background chatter]");
+
// Create mock segments from raw transcript (simulating whisper output)
+
const sentences = rawTranscript.split(/\.\s+/);
+
const mockSegments: { index?: number; start?: number; end?: number; text: string }[] = [];
+
let timeOffset = 0;
+
for (let i = 0; i < sentences.length; i++) {
+
const sentence = sentences[i]?.trim();
+
if (!sentence) continue;
+
const duration = sentence.split(/\s+/).length * 0.3; // ~0.3s per word
+
mockSegments.push({
+
index: i,
+
start: timeOffset,
+
end: timeOffset + duration,
+
text: sentence,
+
});
+
timeOffset += duration;
+
}
+
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: "test-123",
+
rawTranscript,
+
segments: mockSegments,
+
maxWordsMove: 3,
+
});
+
+
// Check that we got a result
+
expect(result.paragraphs).toBeDefined();
+
expect(result.paragraphs!.length).toBeGreaterThan(1); // Should have multiple paragraphs
+
+
// Check that paragraphs have the expected structure
+
for (const para of result.paragraphs!) {
+
expect(para).toHaveProperty('startSegmentIndex');
+
expect(para).toHaveProperty('endSegmentIndex');
+
expect(para).toHaveProperty('text');
+
expect(para.text.length).toBeGreaterThan(0);
+
}
+
+
// The cleaned text should have tags removed
+
const cleanedText = result.paragraphs!.map(p => p.text).join(' ');
-
// Check that we got some text back
-
expect(result.cleanedTranscript.length).toBeGreaterThan(0);
-
expect(result.cleanedTranscript.length).toBeLessThan(rawTranscript.length);
+
expect(cleanedText).not.toContain("[SIDE CONVERSATION]");
+
expect(cleanedText.toLowerCase()).toContain("gospel");
+
expect(cleanedText.toLowerCase()).toContain("evangelism");
-
console.log("Original:", rawTranscript.substring(0, 100));
-
console.log("Cleaned:", result.cleanedTranscript.substring(0, 100));
+
console.log(`Detected ${result.paragraphs!.length} paragraphs from ${mockSegments.length} segments`);
+
console.log("First paragraph:", result.paragraphs![0]?.text.substring(0, 100) + "...");
+
console.log("Last paragraph:", result.paragraphs![result.paragraphs!.length - 1]?.text.substring(0, 100) + "...");
}, 30000); // 30s timeout for API call
-
test("cleanTranscript handles empty transcript", async () => {
-
const result = await cleanTranscript({
-
transcriptId: "test-empty",
-
rawTranscript: "",
-
});
+
test("cleanAndGetParagraphBoundaries handles empty transcript", async () => {
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: "test-empty",
+
rawTranscript: "",
+
segments: [],
+
maxWordsMove: 3,
+
});
-
expect(result.cleanedTranscript).toBe("");
+
expect(result.paragraphs).toEqual([]);
});
-
test("cleanTranscript falls back to raw transcript on API error", async () => {
-
const rawTranscript = "Test transcript";
+
test("cleanAndGetParagraphBoundaries returns error on missing API key", async () => {
+
const rawTranscript = "Test transcript";
-
// Test with missing API key (if it's actually set, this test might fail)
-
const originalKey = process.env.GEMINI_API_KEY;
-
delete process.env.GEMINI_API_KEY;
+
// Test with missing API key (if it's actually set, this test might fail)
+
const originalKey = process.env.OPENROUTER_API_KEY;
+
delete process.env.OPENROUTER_API_KEY;
-
const result = await cleanTranscript({
-
transcriptId: "test-fallback",
-
rawTranscript,
-
});
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: "test-fallback",
+
rawTranscript,
+
segments: [{ text: rawTranscript }],
+
maxWordsMove: 3,
+
});
-
expect(result.cleanedTranscript).toBe(rawTranscript);
-
expect(result.error).toBe("GEMINI_API_KEY not set");
+
expect(result.paragraphs).toBeUndefined();
+
expect(result.error).toBe("OPENROUTER_API_KEY not set");
-
// Restore key
-
if (originalKey) {
-
process.env.GEMINI_API_KEY = originalKey;
-
}
+
// Restore key
+
if (originalKey) {
+
process.env.OPENROUTER_API_KEY = originalKey;
+
}
});
+91 -84
src/lib/transcript-cleaner.ts
···
-
// Clean up transcripts using Gemini to remove tags and fix grammar
-
-
interface CleanTranscriptOptions {
-
transcriptId: string;
-
rawTranscript: string;
-
}
-
-
interface CleanTranscriptResult {
-
cleanedTranscript: string;
-
error?: string;
+
// Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.
+
export interface ParagraphBoundary {
+
startSegmentIndex: number;
+
endSegmentIndex: number;
+
text: string;
+
// Optional: list of moved words for auditing
+
movedWords?: { word: string; fromSegmentIndex: number; toSegmentIndex: number }[];
}
-
/**
-
* Clean transcript using Gemini Flash 2.0 (cheapest model)
-
* Removes tags like [SIDE CONVERSATION], [inaudible], etc.
-
* Fixes grammar while preserving sentence structure
-
*/
-
export async function cleanTranscript({
+
// Cleans transcript and determines paragraph boundaries in one LLM request.
+
// Returns paragraph boundaries as JSON array.
+
export async function cleanAndGetParagraphBoundaries({
transcriptId,
rawTranscript,
-
}: CleanTranscriptOptions): Promise<CleanTranscriptResult> {
-
const apiKey = process.env.GEMINI_API_KEY;
+
segments,
+
maxWordsMove = 0,
+
}: {
+
transcriptId: string;
+
rawTranscript: string;
+
segments: { index?: number; start?: number; end?: number; text: string }[];
+
maxWordsMove?: number;
+
}): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
+
// Skip processing if transcript is empty
+
if (!rawTranscript || rawTranscript.trim().length === 0) {
+
return { paragraphs: [] };
+
}
+
const apiKey = process.env.OPENROUTER_API_KEY;
+
const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
if (!apiKey) {
-
return {
-
cleanedTranscript: rawTranscript,
-
error: "GEMINI_API_KEY not set",
-
};
+
return { error: "OPENROUTER_API_KEY not set" };
}
-
// Skip cleaning if transcript is empty
-
if (!rawTranscript || rawTranscript.trim().length === 0) {
-
return {
-
cleanedTranscript: rawTranscript,
-
};
-
}
+
try {
+
const segmentsPayload = segments.map((s) => ({
+
index: s.index ?? null,
+
start: s.start ?? null,
+
end: s.end ?? null,
+
text: s.text ?? "",
+
}));
-
console.log(
-
`[TranscriptCleaner] Starting cleanup for ${transcriptId} (${rawTranscript.length} chars)`,
-
);
+
const prompt = `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.
-
try {
-
const prompt = `You are a transcript editor. Clean up this transcript by:
+
Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
···
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")
-
Return ONLY the cleaned transcript text, nothing else.
+
Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
+
- Natural topic changes or shifts in the speaker's focus
+
- Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
+
- Complete narrative beats or examples
+
- Typical spoken paragraph length (30-120 seconds / 5-20 segments)
+
+
CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.
+
+
RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):
+
+
[ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]
+
+
Rules for paragraphing:
+
- ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
+
- NEVER break a paragraph in the middle of a sentence
+
- Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
+
- DO NOT put the entire transcript in a single paragraph
+
- Paragraphs must reference original segment indexes
+
- Do not move words across segment boundaries
+
- Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission
+
+
Segments:
+
${JSON.stringify(segmentsPayload, null, 2)}
-
Transcript to clean:
+
Raw Transcript:
${rawTranscript}`;
const response = await fetch(
-
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent",
+
"https://openrouter.ai/api/v1/chat/completions",
{
method: "POST",
headers: {
"Content-Type": "application/json",
-
"x-goog-api-key": apiKey,
+
"Authorization": `Bearer ${apiKey}`,
+
"HTTP-Referer": "https://thistle.app",
+
"X-Title": "Thistle Transcription",
},
body: JSON.stringify({
-
contents: [
-
{
-
parts: [{ text: prompt }],
-
},
+
model,
+
messages: [
+
{ role: "user", content: prompt },
],
-
generationConfig: {
-
temperature: 0.3,
-
topK: 40,
-
topP: 0.95,
-
maxOutputTokens: 8192,
-
},
+
temperature: 0.0,
+
max_tokens: 8192,
}),
},
);
if (!response.ok) {
const errorText = await response.text();
-
console.error(
-
`[TranscriptCleaner] Gemini API error for ${transcriptId}:`,
-
errorText,
-
);
-
return {
-
cleanedTranscript: rawTranscript,
-
error: `Gemini API error: ${response.status}`,
-
};
+
console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
+
return { error: `OpenRouter API error: ${response.status}` };
}
const result = await response.json();
-
const cleanedText =
-
result.candidates?.[0]?.content?.parts?.[0]?.text?.trim();
+
const raw = result.choices?.[0]?.message?.content?.trim();
+
if (!raw) {
+
return { error: "Empty paragrapher response" };
+
}
-
if (!cleanedText) {
-
console.warn(
-
`[TranscriptCleaner] Empty response from Gemini for ${transcriptId}`,
-
);
-
return {
-
cleanedTranscript: rawTranscript,
-
error: "Empty response from Gemini",
-
};
+
let parsed: ParagraphBoundary[] | null = null;
+
try {
+
parsed = JSON.parse(raw) as ParagraphBoundary[];
+
} catch (e) {
+
// Attempt to extract JSON substring if model padded text
+
const firstBracket = raw.indexOf("[");
+
const lastBracket = raw.lastIndexOf("]");
+
if (firstBracket >= 0 && lastBracket > firstBracket) {
+
const substr = raw.substring(firstBracket, lastBracket + 1);
+
parsed = JSON.parse(substr) as ParagraphBoundary[];
+
}
}
-
const reduction = Math.round(
-
((rawTranscript.length - cleanedText.length) / rawTranscript.length) *
-
100,
-
);
-
console.log(
-
`[TranscriptCleaner] Completed for ${transcriptId}: ${rawTranscript.length} → ${cleanedText.length} chars (${reduction}% reduction)`,
-
);
+
if (!parsed || !Array.isArray(parsed)) {
+
return { error: "Failed to parse paragrapher JSON" };
+
}
-
return {
-
cleanedTranscript: cleanedText,
-
};
-
} catch (error) {
-
console.error(
-
`[TranscriptCleaner] Failed to clean ${transcriptId}:`,
-
error,
-
);
-
return {
-
cleanedTranscript: rawTranscript,
-
error: error instanceof Error ? error.message : "Unknown error",
-
};
+
return { paragraphs: parsed };
+
} catch (err) {
+
console.error("[Paragrapher] Exception:", err);
+
return { error: err instanceof Error ? err.message : "Unknown error" };
}
}
+13 -8
src/lib/transcription.ts
···
import { ErrorCode } from "./errors";
import { saveTranscriptVTT } from "./transcript-storage";
import { cleanVTT } from "./vtt-cleaner";
+
import { parseVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
`${this.serviceUrl}/transcribe/${whisperJobId}?format=vtt`,
);
if (vttResponse.ok) {
-
const vttContent = await vttResponse.text();
-
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
-
await saveTranscriptVTT(transcriptionId, cleanedVTT);
-
}
+
const vttContent = await vttResponse.text();
+
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+
await saveTranscriptVTT(transcriptionId, cleanedVTT);
+
this.updateTranscription(transcriptionId, {});
+
}
} catch (error) {
console.warn(
`[Transcription] Failed to fetch VTT for ${transcriptionId}:`,
···
status?: TranscriptionStatus;
progress?: number;
error_message?: string;
+
vttContent?: string;
},
) {
const updates: string[] = [];
···
updates.push("error_message = ?");
values.push(data.error_message);
}
+
updates.push("updated_at = ?");
values.push(Math.floor(Date.now() / 1000));
···
`${this.serviceUrl}/transcribe/${whisperJob.id}?format=vtt`,
);
if (vttResponse.ok) {
-
const vttContent = await vttResponse.text();
-
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
-
await saveTranscriptVTT(transcriptionId, cleanedVTT);
-
}
+
const vttContent = await vttResponse.text();
+
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+
await saveTranscriptVTT(transcriptionId, cleanedVTT);
+
this.updateTranscription(transcriptionId, {});
+
}
} catch (error) {
console.warn(
`[Sync] Failed to fetch VTT for ${transcriptionId}:`,
+44
src/lib/vtt-cleaner.test.ts
···
import { test, expect } from "bun:test";
import { cleanVTT } from "./vtt-cleaner";
+
import { readFileSync } from "fs";
+
import { join } from "path";
const sampleVTT = `WEBVTT
···
expect(result).toBe(emptyVTT);
});
+
+
test("cleanVTT detects multiple paragraphs", async () => {
+
const multiParaVTT = `WEBVTT
+
+
Paragraph 1-1
+
00:00:00.000 --> 00:00:00.000
+
Again, thank you for the privilege to not only study here, but also to teach here. Jesus,
+
+
Paragraph 1-2
+
00:00:00.000 --> 00:00:00.000
+
thank you. All`;
+
+
const result = await cleanVTT("test-multi-para", multiParaVTT);
+
+
expect(result).toContain("Paragraph 1-1");
+
expect(result).toContain("Paragraph 2-1");
+
// Should have at least two paragraphs
+
const paraMatches = result.match(/Paragraph \d+-\d+/g);
+
expect(paraMatches?.length).toBeGreaterThan(1);
+
}, 30000);
+
+
test("cleanVTT with real transcription data", async () => {
+
const originalApiKey = process.env.OPENROUTER_API_KEY;
+
// Temporarily unset to force fallback
+
delete process.env.OPENROUTER_API_KEY;
+
+
try {
+
const vttPath = join(__dirname, "../../transcripts/d69d8076-598a-4fe5-8100-fe3eff47fcd6.vtt");
+
const realVTT = readFileSync(vttPath, "utf-8");
+
+
const result = await cleanVTT("real-test", realVTT);
+
+
expect(result).toContain("WEBVTT");
+
// Check that it has multiple paragraph numbers
+
const paraMatches = result.match(/Paragraph (\d+)-\d+/g);
+
const uniqueParas = new Set(paraMatches?.map(m => m.match(/Paragraph (\d+)/)?.[1]));
+
expect(uniqueParas.size).toBeGreaterThan(1);
+
console.log("Paragraphs found:", uniqueParas.size);
+
} finally {
+
process.env.OPENROUTER_API_KEY = originalApiKey;
+
}
+
}, 30000);
+147 -29
src/lib/vtt-cleaner.ts
···
// Parse and clean VTT files
-
import { cleanTranscript } from "./transcript-cleaner";
+
import type { ParagraphBoundary } from "./transcript-cleaner";
interface VTTSegment {
index?: number;
timestamp: string;
text: string;
+
start?: number;
+
end?: number;
}
/**
-
* Parse VTT content into segments
+
* Parse a VTT timestamp string (hh:mm:ss.mmm or mm:ss.mmm) into seconds
*/
-
function parseVTT(vttContent: string): VTTSegment[] {
+
function parseTimestampToSeconds(ts?: string): number {
+
if (!ts) return 0;
+
// ts expected like "00:00:09.039"
+
const parts = ts.split(":").map((p) => p.trim());
+
const hh = parts[0] ?? "0";
+
const mm = parts[1] ?? "0";
+
const ss = parts[2] ?? "0";
+
if (parts.length === 3) {
+
const seconds =
+
parseInt(hh, 10) * 3600 + parseInt(mm, 10) * 60 + parseFloat(ss);
+
return seconds;
+
} else if (parts.length === 2) {
+
return parseInt(mm, 10) * 60 + parseFloat(ss);
+
}
+
return 0;
+
}
+
+
/**
+
* Parse VTT content into segments, populating start/end in seconds
+
*/
+
export function parseVTT(vttContent: string): VTTSegment[] {
const lines = vttContent.split("\n");
const segments: VTTSegment[] = [];
let currentSegment: Partial<VTTSegment> = {};
···
if (!line) {
if (currentSegment.timestamp && currentSegment.text) {
+
// parse start/end
+
const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+
currentSegment.timestamp || "",
+
);
+
if (match) {
+
currentSegment.start = parseTimestampToSeconds(match[1]);
+
currentSegment.end = parseTimestampToSeconds(match[2]);
+
}
segments.push(currentSegment as VTTSegment);
currentSegment = {};
}
···
continue;
}
+
// Check if it's a cue id (before timestamp)
+
if (!currentSegment.timestamp && line && !line.includes("-->")) {
+
currentSegment.index = line;
+
continue;
+
}
+
// Check if it's a timestamp line
if (line.includes("-->")) {
currentSegment.timestamp = line;
// Next line(s) will be text
const textLines: string[] = [];
i++;
-
while (i < lines.length && lines[i]?.trim() && !lines[i]?.includes("-->")) {
+
while (
+
i < lines.length &&
+
lines[i]?.trim() &&
+
!lines[i]?.includes("-->")
+
) {
textLines.push(lines[i] || "");
i++;
}
···
// Add last segment if exists
if (currentSegment.timestamp && currentSegment.text) {
+
const match = /([\d:.]+)\s*-->\s*([\d:.]+)/.exec(
+
currentSegment.timestamp || "",
+
);
+
if (match?.[1] && match[2]) {
+
currentSegment.start = parseTimestampToSeconds(match[1]);
+
currentSegment.end = parseTimestampToSeconds(match[2]);
+
}
segments.push(currentSegment as VTTSegment);
}
···
}
/**
-
* Clean VTT text segments by removing tags and fixing grammar
+
* Clean VTT text segments by removing tags and fixing grammar.
+
* Additionally, merge cleaned segments into paragraph cues while preserving
+
* stable paragraph IDs (derived from first segment start time).
*/
export async function cleanVTT(
transcriptionId: string,
···
}
console.log(
-
`[VTTCleaner] Cleaning ${segments.length} segments for ${transcriptionId}`,
+
`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
);
-
// Combine all text for cleaning
+
// Combine all text for cleaning and paragraphing
const allText = segments.map((s) => s.text).join(" ");
-
const { cleanedTranscript, error } = await cleanTranscript({
-
transcriptId: transcriptionId,
-
rawTranscript: allText,
-
});
+
// Attempt LLM-driven cleaning and paragraphing in one request, fallback to deterministic rules
+
let paragraphBoundaries: ParagraphBoundary[] = [];
-
if (error) {
-
console.warn(`[VTTCleaner] Falling back to original VTT: ${error}`);
-
return vttContent;
+
try {
+
const { cleanAndGetParagraphBoundaries } = await import(
+
"./transcript-cleaner"
+
);
+
const result = await cleanAndGetParagraphBoundaries({
+
transcriptId: transcriptionId,
+
rawTranscript: allText,
+
segments: segments.map((s) => ({
+
index: s.index,
+
start: s.start,
+
end: s.end,
+
text: s.text,
+
})),
+
maxWordsMove: 0,
+
});
+
+
if (result?.paragraphs) {
+
paragraphBoundaries = result.paragraphs;
+
}
+
} catch (e) {
+
console.warn(
+
"[VTTCleaner] Consolidated LLM failed, no paragraph detection:",
+
e,
+
);
}
-
// Split cleaned text back into segments
-
// Use simple word-based splitting proportional to original segment lengths
-
const words = cleanedTranscript.split(/\s+/);
-
const originalWords = allText.split(/\s+/);
-
const ratio = words.length / originalWords.length;
+
if (paragraphBoundaries.length === 0) {
+
// No paragraphs detected, treat as one big paragraph
+
paragraphBoundaries = [
+
{
+
startSegmentIndex: 0,
+
endSegmentIndex: segments.length - 1,
+
text: allText,
+
},
+
];
+
}
+
+
// Get the full cleaned transcript from paragraphs
+
const cleanedTranscript = paragraphBoundaries.map((p) => p.text).join(" ");
+
+
// Split cleaned text back into segments proportionally (word-based)
+
const words = cleanedTranscript.split(/\s+/).filter(Boolean);
+
const originalWords = allText.split(/\s+/).filter(Boolean);
+
const ratio = words.length / Math.max(1, originalWords.length);
let wordIndex = 0;
const cleanedSegments: VTTSegment[] = [];
for (const segment of segments) {
-
const originalWordCount = segment.text.split(/\s+/).length;
+
const originalWordCount = Math.max(
+
1,
+
segment.text.split(/\s+/).filter(Boolean).length,
+
);
const newWordCount = Math.max(1, Math.round(originalWordCount * ratio));
const segmentWords = words.slice(wordIndex, wordIndex + newWordCount);
wordIndex += newWordCount;
cleanedSegments.push({
+
index: segment.index,
timestamp: segment.timestamp,
text: segmentWords.join(" "),
-
index: segment.index,
+
start: segment.start,
+
end: segment.end,
});
}
-
// Rebuild VTT
-
let output = "WEBVTT\n\n";
-
for (const segment of cleanedSegments) {
-
if (segment.index !== undefined) {
-
output += `${segment.index}\n`;
+
// If any remaining words, append to last segment
+
if (wordIndex < words.length && cleanedSegments.length > 0) {
+
const rest = words.slice(wordIndex).join(" ");
+
const lastIdx = cleanedSegments.length - 1;
+
const lastSeg = cleanedSegments[lastIdx];
+
if (lastSeg) {
+
lastSeg.text += (lastSeg.text ? " " : "") + rest;
}
-
output += `${segment.timestamp}\n`;
-
output += `${segment.text}\n\n`;
}
-
console.log(`[VTTCleaner] Completed for ${transcriptionId}`);
+
// Assign paragraph-based IDs to segments
+
for (let i = 0; i < cleanedSegments.length; i++) {
+
const seg = cleanedSegments[i];
+
if (!seg) continue;
+
+
// Find which paragraph this segment belongs to
+
let paraIndex = 0;
+
let segmentInPara = 1;
+
for (let p = 0; p < paragraphBoundaries.length; p++) {
+
const para = paragraphBoundaries[p];
+
if (i >= para.startSegmentIndex && i <= para.endSegmentIndex) {
+
paraIndex = p + 1;
+
segmentInPara = i - para.startSegmentIndex + 1;
+
break;
+
}
+
}
+
+
// Use paragraph-based ID: "Paragraph N-M" where N is paragraph number, M is segment within paragraph
+
seg.index = `Paragraph ${paraIndex}-${segmentInPara}`;
+
}
+
+
// Build output VTT with cleaned segment cues having paragraph-based IDs
+
let output = "WEBVTT\n\n";
+
for (const seg of cleanedSegments) {
+
if (!seg || !seg.timestamp || !seg.text) continue;
+
output += `${seg.index}\n`;
+
output += `${seg.timestamp}\n`;
+
output += `${seg.text}\n\n`;
+
}
+
+
console.log(
+
`[VTTCleaner] Completed for ${transcriptionId}: ${cleanedSegments.length} segments in ${paragraphBoundaries.length} paragraphs`,
+
);
return output;
}