···
+
* Chunk size for VTT processing
+
// Upper bound on how many parsed segments go into one chunk: cleanVTT slices
// its segment list in steps of this size and hands each slice to one
// processVTTChunk call.
const CHUNK_SIZE = 40; // Segments per chunk
+
* Find paragraph boundaries in processed VTT content
+
* Returns the segments in the last paragraph and highest paragraph number found
+
/**
 * Locate the trailing paragraph of an already-processed VTT chunk.
 *
 * The content is treated as a list of segments separated by blank lines. A
 * segment carries a paragraph number when its first line contains a
 * "Paragraph X-Y" header (X = paragraph number, Y = segment within it).
 *
 * Line endings are normalised to LF and any run of blank (or whitespace-only)
 * lines counts as a single separator, so CRLF output or extra blank lines do
 * not hide the paragraph headers.
 *
 * @param vttContent - Processed VTT text to inspect; may be empty.
 * @returns An object with:
 *   - `segments`: the segments of the last paragraph, joined by a blank line
 *     (empty string when no paragraph header is found);
 *   - `paragraphNumber`: the number of that last paragraph as written in its
 *     header, or `null` when none is found;
 *   - `highestParagraphNumber`: the largest paragraph number seen anywhere in
 *     the content, or 0 when there is none.
 */
function extractLastParagraphAndHighestNumber(vttContent: string): {
  segments: string,
  paragraphNumber: string | null,
  highestParagraphNumber: number
} {
  if (!vttContent) {
    return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };
  }

  // Normalise CRLF / lone CR first, then split on one-or-more blank lines.
  const segments = vttContent
    .replace(/\r\n?/g, '\n')
    .split(/\n(?:[ \t]*\n)+/)
    .filter((segment) => segment.trim() !== '');

  if (segments.length === 0) {
    return { segments: '', paragraphNumber: null, highestParagraphNumber: 0 };
  }

  // Header number of every segment, computed once and shared by both scans.
  const paragraphHeader = /Paragraph (\d+)-\d+/;
  const headerNumbers = segments.map((segment): string | null => {
    const firstLine = segment.split('\n')[0] ?? '';
    return paragraphHeader.exec(firstLine)?.[1] ?? null;
  });

  // Forward scan: the highest paragraph number anywhere in the content.
  let highestParagraphNumber = 0;
  for (const headerNumber of headerNumbers) {
    if (headerNumber === null) continue;
    const parsed = parseInt(headerNumber, 10);
    if (!Number.isNaN(parsed) && parsed > highestParagraphNumber) {
      highestParagraphNumber = parsed;
    }
  }

  // Backward scan: collect the segments of the last numbered paragraph.
  // Collected in reverse order and flipped once at the end.
  const collectedReversed: string[] = [];
  let currentParagraphNumber: string | null = null;

  for (let i = segments.length - 1; i >= 0; i--) {
    const segment = segments[i];
    if (segment === undefined) continue;
    const headerNumber = headerNumbers[i] ?? null;

    if (headerNumber !== null) {
      if (currentParagraphNumber === null) {
        // First header met while walking backwards: this is the last paragraph.
        currentParagraphNumber = headerNumber;
      } else if (headerNumber !== currentParagraphNumber) {
        // Reached the paragraph before the last one; stop collecting.
        break;
      }
      collectedReversed.push(segment);
    } else if (currentParagraphNumber !== null) {
      // Unnumbered segment in front of already-collected ones: keep it as part
      // of the current paragraph. Unnumbered segments that trail the last
      // header are ignored.
      collectedReversed.push(segment);
    }
  }

  return {
    segments: collectedReversed.reverse().join('\n\n'),
    paragraphNumber: currentParagraphNumber,
    highestParagraphNumber,
  };
}
+
* Process a chunk of VTT segments using AI
+
/**
 * Sends one chunk of parsed segments to a chat-completions endpoint and
 * post-processes the reply.
 *
 * Steps visible in this block:
 * - build a prompt that embeds inputSegments as pretty-printed JSON and, when
 *   previousParagraphText is provided, includes it as context that the model
 *   is told not to repeat;
 * - compute nextParagraphNumber as previousParagraphNumber + 1, or "1" when no
 *   previous number is given (the start-number sentence is only added to the
 *   prompt when a previous number exists);
 * - call fetch on apiBaseUrl + "/chat/completions" with a Bearer token;
 * - read choices[0].message.content from the JSON reply and trim it;
 * - unwrap a Markdown code fence if the reply contains one;
 * - drop a leading WEBVTT line together with the blank lines after it.
 *
 * Throws Error("API error: <status>") on the error-response path and
 * Error("Empty response from AI") on the empty-content path.
 *
 * NOTE(review): the remaining parameters (chunkIndex, apiKey, apiBaseUrl, the
 * model), the guards around both throws, the try/catch and the return
 * statement are not visible in this view. The caller pushes the awaited result
 * into a string array, so this presumably resolves to the cleaned VTT text -
 * TODO confirm.
 * NOTE(review): parseInt on a non-numeric previousParagraphNumber yields NaN,
 * and the prompt would then ask to start at "NaN" - confirm callers always
 * pass digits.
 */
async function processVTTChunk(
+
transcriptionId: string,
+
inputSegments: Array<{index: number, timestamp: string, text: string}>,
+
previousParagraphNumber: string | null,
+
previousParagraphText?: string,
+
const chunkId = `${transcriptionId}-chunk${chunkIndex}`;
+
const hasTextContext = !!previousParagraphText;
+
console.log(`[VTTCleaner] Processing chunk ${chunkIndex} with ${inputSegments.length} segments${hasTextContext ? ' and previous paragraph text context' : ''}`);
+
const nextParagraphNumber = previousParagraphNumber ? String(parseInt(previousParagraphNumber, 10) + 1) : '1';
+
const prompt = `Can you turn this into a paragraph separated vtt file?
Use the format "Paragraph X-Y" where X is the paragraph number and Y is the segment number within that paragraph:
···
I want you to preserve sentences across paragraph breaks moving whatever is the smallest amount out to its own segment block.
+
Here are important guidelines for forming paragraphs:
+
1. Create a new paragraph when there's a change in topic or speaker.
+
2. Don't make paragraphs too long - aim for 4-5 sentences per paragraph maximum.
+
3. Group related thoughts together in the same paragraph.
+
4. Start a new paragraph when a sentence introduces a completely new idea.
+
5. Focus on the number of sentences, not segments, when creating paragraphs.
+
6. The number of segments in a paragraph may vary, but keep paragraphs to a reasonable length.
Also go through and rewrite the words to extract the meaning and not necessarily the exact phrasing if it sounds unnatural when written. I want the text to remain lined up with the original though so don't rewrite entire paragraphs but you can remove ums, alrights, and similar. Also remove all contextual tags like [background noise]. Add punctuation if it's missing to make the text readable. If there is no more context to fit a segment then just skip it and move to the next one.
+
`The following is the last paragraph from the previous chunk and is provided for context only. DO NOT include it in your output - it's already in the transcript:
+
${previousParagraphText}
+
Now process the following new segments, continuing from the previous paragraph. ${previousParagraphNumber ? `Start your paragraphs with number ${nextParagraphNumber} (unless you're continuing the previous paragraph).` : ''}`
+
: 'Process the following segments:'}
${JSON.stringify(inputSegments, null, 2)}
+
Return ONLY the VTT content WITHOUT the "WEBVTT" header and nothing else. No explanations or additional text.`;
const response = await fetch(
`${apiBaseUrl}/chat/completions`,
···
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`,
"HTTP-Referer": "https://thistle.app",
+
"X-Title": `Thistle Transcription Chunk ${chunkIndex}`,
···
{ role: "user", content: prompt },
+
max_tokens: 8192, // Reduced for chunks
const errorText = await response.text();
+
console.error(`[VTTCleaner] OpenRouter error for ${chunkId}:`, errorText);
+
throw new Error(`API error: ${response.status}`);
const result = await response.json();
const cleanedVTT = result.choices?.[0]?.message?.content?.trim();
+
throw new Error("Empty response from AI");
// Extract VTT content if the model wrapped it in markdown
+
let chunkVTT = cleanedVTT;
if (cleanedVTT.includes("```")) {
const vttMatch = cleanedVTT.match(/```(?:vtt)?\n([\s\S]*?)```/);
+
chunkVTT = vttMatch[1].trim();
+
// Remove WEBVTT header if present (we'll add it once at the end)
+
if (chunkVTT.startsWith("WEBVTT")) {
+
const lines = chunkVTT.split("\n");
+
// Skip WEBVTT line and any blank lines that follow
+
while (i < lines.length && !lines[i]?.trim()) {
+
chunkVTT = lines.slice(i).join("\n");
+
console.log(`[VTTCleaner] Successfully processed chunk ${chunkIndex}`);
+
console.error(`[VTTCleaner] Exception in chunk ${chunkIndex}:`, error);
+
* Clean VTT text using AI to create paragraph-separated VTT file.
+
* Uses OpenRouter API to intelligently group segments into paragraphs
+
* while preserving timing information. Processes sequentially in chunks
+
* with context from previous chunks to maintain paragraph continuity.
+
/**
 * Entry point of the cleaner: parses the incoming VTT, splits the segments
 * into chunks of at most CHUNK_SIZE, and runs processVTTChunk on them one
 * after another so each call can receive context from the previous result.
 *
 * Configuration is read from the environment variables LLM_API_KEY,
 * LLM_API_BASE_URL and LLM_MODEL. When any of them is missing a warning is
 * logged whose message says the uncleaned VTT is returned.
 *
 * After every chunk except the last, extractLastParagraphAndHighestNumber is
 * run on the processed text. Its last-paragraph text and its highest paragraph
 * number (as a string) become the context for the next call. When a chunk
 * fails, the original segments of that chunk are emitted instead (index,
 * timestamp and text on separate lines) and the context is cleared.
 *
 * The final document is "WEBVTT", a blank line, and the processed chunks
 * joined by blank lines.
 *
 * NOTE(review): the rest of the signature, the early returns, the try/catch
 * structure and the segment mapping are not visible in this view - TODO
 * confirm the exact fallback return values.
 * NOTE(review): whenever the context is cleared to null, processVTTChunk
 * computes paragraph number 1 again, so paragraph numbers can repeat across
 * chunks - confirm downstream consumers tolerate this.
 * NOTE(review): seg.index || "" renders an index of 0 as an empty line -
 * confirm indices start at 1.
 */
export async function cleanVTT(
+
transcriptionId: string,
+
const segments = parseVTT(vttContent);
+
if (segments.length === 0) {
+
`[VTTCleaner] Processing ${segments.length} segments for ${transcriptionId}`,
+
const apiKey = process.env.LLM_API_KEY;
+
const apiBaseUrl = process.env.LLM_API_BASE_URL;
+
const model = process.env.LLM_MODEL;
+
if (!apiKey || !apiBaseUrl || !model) {
+
console.warn("[VTTCleaner] LLM configuration incomplete (need LLM_API_KEY, LLM_API_BASE_URL, LLM_MODEL), returning uncleaned VTT");
+
// Build the input segments
+
const inputSegments = segments.map((seg, idx) => ({
+
timestamp: seg.timestamp,
+
// Prepare chunks for sequential processing
+
const chunks: Array<typeof inputSegments> = [];
+
for (let i = 0; i < inputSegments.length; i += CHUNK_SIZE) {
+
// Don't go beyond array bounds
+
const end = Math.min(i + CHUNK_SIZE, inputSegments.length);
+
chunks.push(inputSegments.slice(i, end));
+
console.log(`[VTTCleaner] Split into ${chunks.length} chunks for sequential processing with paragraph context`);
+
// Process chunks sequentially with context from previous chunk
+
const processedChunks: string[] = [];
+
let previousParagraphText: string | undefined;
+
let previousParagraphNumber: string | null = null;
+
for (let i = 0; i < chunks.length; i++) {
+
const chunk = chunks[i];
+
if (!chunk || chunk.length === 0) continue;
+
const processedChunk = await processVTTChunk(
+
previousParagraphNumber,
+
processedChunks.push(processedChunk);
+
console.log(`[VTTCleaner] Completed chunk ${i}/${chunks.length - 1}${previousParagraphText ? ' (with context)' : ''}`);
+
// Extract context for the next chunk
+
if (i < chunks.length - 1) {
+
const { segments: lastParagraphText, paragraphNumber, highestParagraphNumber } = extractLastParagraphAndHighestNumber(processedChunk);
+
if (lastParagraphText) {
+
console.log(`[VTTCleaner] Using paragraph ${paragraphNumber || 'unknown'} as context for next chunk (highest paragraph: ${highestParagraphNumber})`);
+
previousParagraphText = lastParagraphText;
+
previousParagraphNumber = highestParagraphNumber.toString();
+
previousParagraphText = undefined;
+
previousParagraphNumber = null;
+
console.error(`[VTTCleaner] Chunk ${i} failed:`, error);
+
// Return the original segments for this chunk if processing fails
+
const fallbackChunk = chunk.map(seg =>
+
`${seg.index || ''}\n${seg.timestamp}\n${seg.text}`
+
processedChunks.push(fallbackChunk);
+
previousParagraphText = undefined;
+
previousParagraphNumber = null;
+
// Combine all processed chunks
+
const finalVTT = `WEBVTT\n\n${processedChunks.join('\n\n')}`;
+
`[VTTCleaner] Successfully cleaned ${segments.length} segments in ${chunks.length} sequential chunks with paragraph context`,
+
console.error("[VTTCleaner] Exception:", error);
console.warn("[VTTCleaner] Falling back to uncleaned VTT");