distributed transcription service
thistle.dunkirk.sh
1// Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.
/**
 * One paragraph of cleaned transcript text together with the range of
 * original transcript segments it was built from.
 */
export interface ParagraphBoundary {
  /** Index of the original segment where this paragraph starts. */
  startSegmentIndex: number;
  /** Index of the original segment where this paragraph ends (presumably inclusive; confirm against consumers). */
  endSegmentIndex: number;
  /** Text of the paragraph. */
  text: string;
  /** Optional audit list of words that were moved from one segment to another. */
  movedWords?: Array<{
    word: string;
    fromSegmentIndex: number;
    toSegmentIndex: number;
  }>;
}
9
/**
 * Cleans a raw transcript and groups it into paragraphs using a single
 * OpenRouter chat-completion request.
 *
 * Failures are reported through the `error` field of the result; the function
 * catches its own exceptions rather than throwing.
 *
 * @param options.transcriptId - Identifier used only to label log output.
 * @param options.rawTranscript - Full raw transcript text. When empty or
 *   whitespace-only, no request is made and `{ paragraphs: [] }` is returned.
 * @param options.segments - Original transcript segments; their index, start,
 *   end and text are embedded in the prompt (missing numeric fields are sent
 *   as `null`).
 * @param options.maxWordsMove - Accepted for interface compatibility but not
 *   currently used: the prompt always forbids moving words across segments.
 * @returns `{ paragraphs }` when the model reply is a JSON array of
 *   well-formed paragraph objects, otherwise `{ error }` with a short message.
 */
export async function cleanAndGetParagraphBoundaries({
  transcriptId,
  rawTranscript,
  segments,
  maxWordsMove = 0,
}: {
  transcriptId: string;
  rawTranscript: string;
  segments: { index?: number; start?: number; end?: number; text: string }[];
  maxWordsMove?: number;
}): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
  // Nothing to clean or paragraph: skip the network round trip entirely.
  if (!rawTranscript || rawTranscript.trim().length === 0) {
    return { paragraphs: [] };
  }

  const apiKey = process.env.OPENROUTER_API_KEY;
  // `||` (not `??`) on purpose: an empty OPENROUTER_MODEL also falls back.
  const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
  if (!apiKey) {
    return { error: "OPENROUTER_API_KEY not set" };
  }

  try {
    const prompt = buildParagrapherPrompt(segments, rawTranscript);

    const response = await fetch(
      "https://openrouter.ai/api/v1/chat/completions",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "Authorization": `Bearer ${apiKey}`,
          "HTTP-Referer": "https://thistle.app",
          "X-Title": "Thistle Transcription",
        },
        body: JSON.stringify({
          model,
          messages: [{ role: "user", content: prompt }],
          temperature: 0.0,
          max_tokens: 8192,
        }),
      },
    );

    if (!response.ok) {
      const errorText = await response.text();
      console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
      return { error: `OpenRouter API error: ${response.status}` };
    }

    const raw = extractMessageContent(await response.json());
    if (!raw) {
      return { error: "Empty paragrapher response" };
    }

    const paragraphs = parseParagraphBoundaries(raw);
    if (!paragraphs) {
      return { error: "Failed to parse paragrapher JSON" };
    }

    return { paragraphs };
  } catch (err: unknown) {
    // Include the transcript id so this log line matches the HTTP-error log.
    console.error(`[Paragrapher] Exception for ${transcriptId}:`, err);
    return { error: err instanceof Error ? err.message : "Unknown error" };
  }
}

/** Builds the single user prompt sent to the model from the segments and raw text. */
function buildParagrapherPrompt(
  segments: { index?: number; start?: number; end?: number; text: string }[],
  rawTranscript: string,
): string {
  // Normalise missing fields to explicit nulls so every segment has the same keys.
  const segmentsPayload = segments.map((s) => ({
    index: s.index ?? null,
    start: s.start ?? null,
    end: s.end ?? null,
    text: s.text ?? "",
  }));

  return `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.

Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
4. Fixing obvious speech recognition errors (e.g., "gr..." should be "grade")
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")

Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
- Natural topic changes or shifts in the speaker's focus
- Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
- Complete narrative beats or examples
- Typical spoken paragraph length (30-120 seconds / 5-20 segments)

CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.

RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):

[ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]

Rules for paragraphing:
- ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
- NEVER break a paragraph in the middle of a sentence
- Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
- DO NOT put the entire transcript in a single paragraph
- Paragraphs must reference original segment indexes
- Do not move words across segment boundaries
- Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission

Segments:
${JSON.stringify(segmentsPayload, null, 2)}

Raw Transcript:
${rawTranscript}`;
}

/** Returns the trimmed text of the first choice's message, or "" when it is absent or not a string. */
function extractMessageContent(payload: unknown): string {
  const result = payload as
    | { choices?: { message?: { content?: unknown } | null }[] | null }
    | null
    | undefined;
  const content = result?.choices?.[0]?.message?.content;
  // Guard the type before trimming: non-string content must not throw.
  return typeof content === "string" ? content.trim() : "";
}

/** Runtime check that a parsed JSON value has the required ParagraphBoundary fields. */
function isParagraphBoundary(value: unknown): value is ParagraphBoundary {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    Number.isInteger(candidate.startSegmentIndex) &&
    Number.isInteger(candidate.endSegmentIndex) &&
    typeof candidate.text === "string"
  );
}

/**
 * Parses the model reply into paragraph objects, or returns null when the
 * reply is not a JSON array of well-formed paragraphs.
 */
function parseParagraphBoundaries(raw: string): ParagraphBoundary[] | null {
  let value: unknown;
  try {
    value = JSON.parse(raw);
  } catch {
    // The model may pad the array with prose or code fences: retry on the
    // outermost [...] span, and treat a second failure as a parse failure
    // instead of letting the SyntaxError escape.
    const firstBracket = raw.indexOf("[");
    const lastBracket = raw.lastIndexOf("]");
    if (firstBracket < 0 || lastBracket <= firstBracket) {
      return null;
    }
    try {
      value = JSON.parse(raw.substring(firstBracket, lastBracket + 1));
    } catch {
      return null;
    }
  }

  if (!Array.isArray(value) || !value.every(isParagraphBoundary)) {
    return null;
  }
  return value;
}