🪻 distributed transcription service thistle.dunkirk.sh

chore: update the vtt cleaner

dunkirk.sh 17206359 c436bc56

verified
Changed files
+240 -52
src
+240 -52
src/lib/vtt-cleaner.ts
···
}
/**
-
* Clean VTT text using AI to create paragraph-separated VTT file.
-
* Uses OpenRouter API to intelligently group segments into paragraphs
-
* while preserving timing information.
+
* Chunk size for VTT processing
*/
-
export async function cleanVTT(
-
transcriptionId: string,
-
vttContent: string,
-
): Promise<string> {
-
const segments = parseVTT(vttContent);
+
const CHUNK_SIZE = 40; // Segments per chunk
-
if (segments.length === 0) {
-
return vttContent;
+
/**
 * Locate the trailing paragraph of an already-processed VTT chunk.
 *
 * The content is split into cue blocks on blank lines. A block is considered
 * labelled when its first line contains "Paragraph X-Y".
 *
 * @param vttContent - Processed VTT text (without the "WEBVTT" header).
 * @returns
 *  - `segments`: the blocks belonging to the last labelled paragraph, joined
 *    with a blank line (line endings normalised to "\n"); empty when no
 *    labelled block exists.
 *  - `paragraphNumber`: the X of that last paragraph as written in the text,
 *    or `null` when none was found.
 *  - `highestParagraphNumber`: the largest X seen in any block (0 if none).
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
  segments: string,
  paragraphNumber: string | null,
  highestParagraphNumber: number
} {
  if (!vttContent) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };

  // WebVTT permits CRLF, CR and LF terminators. Normalise first so that blank
  // line detection and first-line extraction behave the same for all of them.
  const blocks = vttContent
    .replace(/\r\n?/g, '\n')
    .trim()
    // Any run of blank (or whitespace-only) lines separates two blocks; a
    // literal '\n\n' split would leave a stray leading newline on the next
    // block after three or more newlines and hide its paragraph label.
    .split(/\n\s*\n/)
    .filter((block) => block.trim() !== '');

  if (blocks.length === 0) return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };

  // Compile the label pattern once and label every block in a single pass.
  const paragraphLabel = /Paragraph (\d+)-\d+/;
  const labels: Array<string | null> = blocks.map((block) => {
    const firstLine = block.split('\n', 1)[0] ?? '';
    return paragraphLabel.exec(firstLine)?.[1] ?? null;
  });

  let highestParagraphNumber = 0;
  for (const label of labels) {
    if (label === null) continue;
    const value = parseInt(label, 10);
    if (!Number.isNaN(value) && value > highestParagraphNumber) {
      highestParagraphNumber = value;
    }
  }

  // Walk backwards collecting the last labelled paragraph. Blocks are gathered
  // in reverse order and flipped once at the end.
  const collected: string[] = [];
  let lastLabel: string | null = null;

  for (let i = blocks.length - 1; i >= 0; i--) {
    const block = blocks[i];
    if (block === undefined) continue;
    const label = labels[i] ?? null;

    if (label === null) {
      // Unlabelled block: kept only once a labelled block has been found
      // further down, i.e. it sits inside/above the paragraph being collected.
      if (lastLabel !== null) collected.push(block);
      continue;
    }

    if (lastLabel === null) {
      lastLabel = label;
    } else if (parseInt(label, 10) !== parseInt(lastLabel, 10)) {
      // Reached the preceding paragraph. Compared numerically so "01" and "1"
      // are treated as the same paragraph.
      break;
    }
    collected.push(block);
  }

  return {
    segments: collected.reverse().join('\n\n'),
    paragraphNumber: lastLabel,
    highestParagraphNumber
  };
}
-
try {
-
// Build the input for the AI
-
const inputSegments = segments.map((seg, idx) => ({
-
index: idx,
-
timestamp: seg.timestamp,
-
text: seg.text,
-
}));
-
-
const prompt = `Can you turn this into a paragraph separated vtt file?
+
/**
+
* Process a chunk of VTT segments using AI
+
*/
+
async function processVTTChunk(
+
transcriptionId: string,
+
inputSegments: Array<{index: number, timestamp: string, text: string}>,
+
chunkIndex: number,
+
previousParagraphNumber: string | null,
+
apiKey: string,
+
apiBaseUrl: string,
+
model: string,
+
previousParagraphText?: string,
+
): Promise<string> {
+
const chunkId = `${transcriptionId}-chunk${chunkIndex}`;
+
+
const hasTextContext = !!previousParagraphText;
+
+
console.log(`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? ' and previous paragraph text context' : ''}`);
+
+
const nextParagraphNumber = previousParagraphNumber ? String(parseInt(previousParagraphNumber, 10) + 1) : '1';
+
+
const prompt = `Can you turn this into a paragraph separated vtt file?
Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
···
I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
+
Here are important guidelines for forming paragraphs:
+
1. Create a new paragraph when there's a change in topic or speaker.
+
2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
+
3. Group related thoughts together in the same paragraph.
+
4. Start a new paragraph when a sentence introduces a completely new idea.
+
5. Focus on the number of sentences, not segments, when creating paragraphs.
+
6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.
+
Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
-
Input segments:
+
${hasTextContext ?
+
`The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:
+
+
${previousParagraphText}
+
+
Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ''}`
+
: 'Process the following segments:'}
+
${JSON.stringify(inputSegments, null, 2)}
-
Return ONLY the VTT content starting with "WEBVTT" and nothing else. No explanations or additional text.`;
+
Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;
+
try {
const response = await fetch(
`${apiBaseUrl}/chat/completions`,
{
···
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`,
"HTTP-Referer": "https://thistle.app",
-
"X-Title": "Thistle Transcription",
+
"X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
},
body: JSON.stringify({
model,
···
{ role: "user", content: prompt },
],
temperature: 0.3,
-
max_tokens: 16384,
+
max_tokens: 8192, // Reduced for chunks
}),
},
);
if (!response.ok) {
const errorText = await response.text();
-
console.error(`[VTTCleaner] OpenRouter error for ${transcriptionId}:`, errorText);
-
console.warn("[VTTCleaner] Falling back to uncleaned VTT");
-
return vttContent;
+
console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
+
throw new Error(`API error: ${response.status}`);
}
const result = await response.json();
const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
if (!cleanedVTT) {
-
console.warn("[VTTCleaner] Empty response from AI, returning uncleaned VTT");
-
return vttContent;
+
throw new Error("Empty response from AI");
}
// Extract VTT content if the model wrapped it in markdown
-
let finalVTT = cleanedVTT;
+
let chunkVTT = cleanedVTT;
if (cleanedVTT.includes("```")) {
const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
if (vttMatch?.[1]) {
-
finalVTT = vttMatch[1].trim();
+
chunkVTT = vttMatch[1].trim();
}
}
-
// Ensure it starts with WEBVTT
-
if (!finalVTT.startsWith("WEBVTT")) {
-
const webvttIndex = finalVTT.indexOf("WEBVTT");
-
if (webvttIndex !== -1) {
-
finalVTT = finalVTT.substring(webvttIndex);
-
} else {
-
finalVTT = `WEBVTT\n\n${finalVTT}`;
+
// Remove WEBVTT header if present (we'll add it once at the end)
+
if (chunkVTT.startsWith("WEBVTT")) {
+
const lines = chunkVTT.split("\n");
+
// Skip WEBVTT line and any blank lines that follow
+
let i = 1;
+
while (i < lines.length && !lines[i]?.trim()) {
+
i++;
}
+
chunkVTT = lines.slice(i).join("\n");
}
+
console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
+
return chunkVTT;
+
} catch (error) {
+
console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
+
throw error;
+
}
+
}
+
+
/**
 * Clean VTT text using AI to create a paragraph-separated VTT file.
 *
 * The parsed segments are split into chunks of CHUNK_SIZE and handed to
 * processVTTChunk one after another. After each successful chunk, the last
 * paragraph of its output and the highest paragraph number seen are passed
 * to the next chunk as context so paragraph numbering can continue.
 *
 * Fallback behaviour:
 *  - no parsable segments, or incomplete LLM configuration (LLM_API_KEY,
 *    LLM_API_BASE_URL, LLM_MODEL): the input is returned unchanged;
 *  - a single chunk fails: that chunk's original segments are emitted as
 *    plain cues and the context for the next chunk is reset;
 *  - every chunk fails, or an unexpected exception occurs: the input is
 *    returned unchanged.
 *
 * @param transcriptionId - Identifier used in log messages and chunk ids.
 * @param vttContent - Raw VTT text to clean.
 * @returns The cleaned VTT (starting with "WEBVTT"), or `vttContent` as-is
 *          when cleaning is not possible.
 */
export async function cleanVTT(
  transcriptionId: string,
  vttContent: string,
): Promise<string> {
  const segments = parseVTT(vttContent);

  if (segments.length === 0) {
    return vttContent;
  }

  console.log(
    `[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
  );

  const apiKey = process.env.LLM_API_KEY;
  const apiBaseUrl = process.env.LLM_API_BASE_URL;
  const model = process.env.LLM_MODEL;

  if (!apiKey || !apiBaseUrl || !model) {
    console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
    return vttContent;
  }

  try {
    // Build the input segments
    const inputSegments = segments.map((seg, idx) => ({
      index: idx,
      timestamp: seg.timestamp,
      text: seg.text,
    }));

    // Prepare chunks for sequential processing. slice() clamps the end index
    // to the array length, so no explicit bounds check is needed.
    const chunks: Array<typeof inputSegments> = [];
    for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
      chunks.push(inputSegments.slice(i, i + CHUNK_SIZE));
    }

    console.log(`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`);

    // Process chunks sequentially with context from previous chunk
    const processedChunks: string[] = [];
    let previousParagraphText: string | undefined;
    let previousParagraphNumber: string | null = null;
    let failedChunks = 0;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      if (!chunk || chunk.length === 0) continue;

      try {
        const processedChunk = await processVTTChunk(
          transcriptionId,
          chunk,
          i,
          previousParagraphNumber,
          apiKey,
          apiBaseUrl,
          model,
          previousParagraphText
        );
        processedChunks.push(processedChunk);
        console.log(`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? ' (with context)' : ''}`);

        // Extract context for the next chunk (not needed after the last one)
        if (i < chunks.length - 1) {
          const { segments: lastParagraphText, paragraphNumber, highestParagraphNumber } = extractLastParagraphAndHighestNumber(processedChunk);

          if (lastParagraphText) {
            console.log(`[VTTCleaner] Using paragraph ${paragraphNumber || 'unknown'} as context for next chunk (highest paragraph: ${highestParagraphNumber})`);
            previousParagraphText = lastParagraphText;
            // Numbering continues from the highest number seen, which may be
            // larger than the number of the trailing paragraph.
            previousParagraphNumber = highestParagraphNumber.toString();
          } else {
            previousParagraphText = undefined;
            previousParagraphNumber = null;
          }
        }
      } catch (error) {
        failedChunks++;
        console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
        // Emit the original segments for this chunk if processing fails.
        // The index is written as-is: `seg.index || ''` would turn the very
        // first segment (index 0) into an empty identifier line.
        const fallbackChunk = chunk.map(seg =>
          `${seg.index}\n${seg.timestamp}\n${seg.text}`
        ).join('\n\n');
        processedChunks.push(fallbackChunk);
        previousParagraphText = undefined;
        previousParagraphNumber = null;
      }
    }

    // Nothing was cleaned at all: the untouched input is a more faithful
    // fallback than a VTT rebuilt from parsed segments.
    if (failedChunks === chunks.length) {
      console.warn("[VTTCleaner] All chunks failed, falling back to uncleaned VTT");
      return vttContent;
    }

    if (failedChunks > 0) {
      console.warn(`[VTTCleaner] ${failedChunks} of ${chunks.length} chunks fell back to uncleaned segments`);
    }

    // Combine all processed chunks
    const finalVTT = `WEBVTT\n\n${processedChunks.join('\n\n')}`;

    console.log(
      `[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
    );

    return finalVTT;
  } catch (error) {
    console.error("[VTTCleaner] Exception:", error);
    console.warn("[VTTCleaner] Falling back to uncleaned VTT");
    return vttContent;
  }
}