馃 distributed transcription service thistle.dunkirk.sh
// Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.

/** One paragraph as reported by the model, tied back to the original transcript segments. */
export interface ParagraphBoundary {
  /** Segment index the model reported as the start of the paragraph. */
  startSegmentIndex: number;
  /** Segment index the model reported as the end of the paragraph. */
  endSegmentIndex: number;
  /** Cleaned paragraph text produced by the model. */
  text: string;
  // Optional: list of moved words for auditing
  movedWords?: { word: string; fromSegmentIndex: number; toSegmentIndex: number }[];
}

/** Shape of a transcript segment accepted by the paragrapher. */
type ParagrapherSegment = { index?: number; start?: number; end?: number; text: string };

/** Builds the single user prompt sent to the model (instructions + segments + raw transcript). */
function buildParagrapherPrompt(
  segments: readonly ParagrapherSegment[],
  rawTranscript: string,
): string {
  // Missing fields are sent as explicit nulls so every segment has the same keys.
  const segmentsPayload = segments.map((s) => ({
    index: s.index ?? null,
    start: s.start ?? null,
    end: s.end ?? null,
    text: s.text ?? "",
  }));

  return `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.

Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
4. Fixing obvious speech recognition errors (e.g., "gr..." should be "grade")
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")

Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
- Natural topic changes or shifts in the speaker's focus
- Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
- Complete narrative beats or examples
- Typical spoken paragraph length (30-120 seconds / 5-20 segments)

CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.

RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):

[ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]

Rules for paragraphing:
- ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
- NEVER break a paragraph in the middle of a sentence
- Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
- DO NOT put the entire transcript in a single paragraph
- Paragraphs must reference original segment indexes
- Do not move words across segment boundaries
- Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission

Segments:
${JSON.stringify(segmentsPayload, null, 2)}

Raw Transcript:
${rawTranscript}`;
}

/** Parses JSON without throwing; returns undefined when the text is not valid JSON. */
function tryParseJson(text: string): unknown {
  try {
    return JSON.parse(text) as unknown;
  } catch {
    return undefined;
  }
}

/**
 * Runtime check for the required ParagraphBoundary fields.
 * The optional `movedWords` field is not inspected.
 */
function isParagraphBoundary(value: unknown): value is ParagraphBoundary {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    Number.isInteger(candidate.startSegmentIndex) &&
    Number.isInteger(candidate.endSegmentIndex) &&
    typeof candidate.text === "string"
  );
}

/**
 * Extracts the paragraph array from the model's reply.
 * Returns null when no well-formed array of paragraph objects can be recovered.
 */
function parseParagraphBoundaries(raw: string): ParagraphBoundary[] | null {
  let candidate = tryParseJson(raw);

  if (!Array.isArray(candidate)) {
    // The model may pad the array with prose, code fences, or a wrapper object:
    // retry on the outermost [...] span.
    const firstBracket = raw.indexOf("[");
    const lastBracket = raw.lastIndexOf("]");
    if (firstBracket >= 0 && lastBracket > firstBracket) {
      candidate = tryParseJson(raw.substring(firstBracket, lastBracket + 1));
    }
  }

  if (!Array.isArray(candidate) || !candidate.every(isParagraphBoundary)) {
    return null;
  }
  return candidate;
}

/**
 * Cleans transcript and determines paragraph boundaries in one LLM request.
 *
 * Reads `OPENROUTER_API_KEY` (required) and `OPENROUTER_MODEL` (optional, defaults to
 * "openrouter/polaris-alpha") from the environment and posts a single chat completion
 * request to OpenRouter.
 *
 * @param transcriptId - Used only to tag the error log when OpenRouter responds with a non-OK status.
 * @param rawTranscript - Raw transcript text; an empty or whitespace-only value short-circuits
 *   to `{ paragraphs: [] }` without any network call.
 * @param segments - Original transcript segments, embedded in the prompt as JSON.
 * @param maxWordsMove - Accepted for interface compatibility; not currently read by this function
 *   (the prompt tells the model not to move words across segment boundaries).
 * @returns `{ paragraphs }` on success, or `{ error }` describing the failure. This function
 *   does not throw: exceptions are logged and converted into `{ error }`.
 */
export async function cleanAndGetParagraphBoundaries({
  transcriptId,
  rawTranscript,
  segments,
  maxWordsMove = 0,
}: {
  transcriptId: string;
  rawTranscript: string;
  segments: { index?: number; start?: number; end?: number; text: string }[];
  maxWordsMove?: number;
}): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
  // Skip processing if transcript is empty
  if (!rawTranscript || rawTranscript.trim().length === 0) {
    return { paragraphs: [] };
  }

  const apiKey = process.env.OPENROUTER_API_KEY;
  // `||` (not `??`) so an empty OPENROUTER_MODEL also falls back to the default.
  const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
  if (!apiKey) {
    return { error: "OPENROUTER_API_KEY not set" };
  }

  try {
    const prompt = buildParagrapherPrompt(segments, rawTranscript);

    const response = await fetch(
      "https://openrouter.ai/api/v1/chat/completions",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "Authorization": `Bearer ${apiKey}`,
          "HTTP-Referer": "https://thistle.app",
          "X-Title": "Thistle Transcription",
        },
        body: JSON.stringify({
          model,
          messages: [
            { role: "user", content: prompt },
          ],
          temperature: 0.0,
          max_tokens: 8192,
        }),
      },
    );

    if (!response.ok) {
      const errorText = await response.text();
      console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
      return { error: `OpenRouter API error: ${response.status}` };
    }

    const result = (await response.json()) as
      | { choices?: { message?: { content?: unknown } }[] }
      | null;
    const content = result?.choices?.[0]?.message?.content;
    // Non-string content is treated like an empty reply instead of throwing on .trim().
    const raw = typeof content === "string" ? content.trim() : "";
    if (!raw) {
      return { error: "Empty paragrapher response" };
    }

    const parsed = parseParagraphBoundaries(raw);
    if (!parsed) {
      return { error: "Failed to parse paragrapher JSON" };
    }

    return { paragraphs: parsed };
  } catch (err) {
    console.error("[Paragrapher] Exception:", err);
    return { error: err instanceof Error ? err.message : "Unknown error" };
  }
}