🪻 distributed transcription service thistle.dunkirk.sh

chore: remove the legacy transcript cleaner

dunkirk.sh 12c8fdc6 e8bd6888

verified
-96
src/lib/transcript-cleaner.test.ts
···
-
import { test, expect } from "bun:test";
-
import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";
-
-
// AI integration test - skip by default to avoid burning credits
-
// To run it, temporarily change `test.skip` to `test` below, then: bun test src/lib/transcript-cleaner.test.ts --test-name-pattern "cleans transcript"
-
/**
 * Live integration test: sends a realistic multi-paragraph transcript through
 * cleanAndGetParagraphBoundaries and checks the shape and content of the
 * returned paragraphs. Skipped by default because it performs a real API call.
 */
test.skip("cleanAndGetParagraphBoundaries cleans transcript and returns paragraph boundaries", async () => {
  // Use a longer, more realistic transcript sample with natural paragraph breaks
  const rawTranscript = `[SIDE CONVERSATION] Today in chapel we are talking about the fact that we believe in having gospel conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's gonna be a little more conversational than normal. It's not gonna be like one of the normal sermons, although I know me and my tendency it'll turn into a sermon at some point just because that's the way God made me, so I can't help it.

Alright, so when it starts just have fun with it. We'll go on. Here's what it says in our doctrinal statement. It says, "Due to the commission of Christ and the urgency of the Gospel, all believers are to engage in Gospel conversations." How many of you believe that? That's pretty weak. How many of you believe that?

To live God-honoring lives and to work continuously for the spread of the Gospel to their neighbors and the nations. Now, let's be honest, as we start off this morning, all of us could do a better job with personal evangelism, and all of us could do a better job with a heart for missions.

So I'm not up here talking to you about something I have conquered or mastered. I'm not the expert on this. In fact, when it comes to personal evangelism in my own strength, I'm often a complete failure. But I have found that even in my weakness, God can use me in powerful ways when I make myself available to Him.`;

  // Create mock segments from raw transcript (simulating whisper output)
  const sentences = rawTranscript.split(/\.\s+/);
  const mockSegments: { index?: number; start?: number; end?: number; text: string }[] = [];
  let timeOffset = 0;
  for (let i = 0; i < sentences.length; i++) {
    const sentence = sentences[i]?.trim();
    if (!sentence) continue;
    const duration = sentence.split(/\s+/).length * 0.3; // ~0.3s per word
    mockSegments.push({
      index: i,
      start: timeOffset,
      end: timeOffset + duration,
      text: sentence,
    });
    timeOffset += duration;
  }

  const result = await cleanAndGetParagraphBoundaries({
    transcriptId: "test-123",
    rawTranscript,
    segments: mockSegments,
    maxWordsMove: 3,
  });

  // Check that we got a result, then narrow once so the remaining checks
  // do not need a non-null assertion on every access.
  expect(result.paragraphs).toBeDefined();
  const paragraphs = result.paragraphs ?? [];
  expect(paragraphs.length).toBeGreaterThan(1); // Should have multiple paragraphs

  // Check that paragraphs have the expected structure
  for (const para of paragraphs) {
    expect(para).toHaveProperty('startSegmentIndex');
    expect(para).toHaveProperty('endSegmentIndex');
    expect(para).toHaveProperty('text');
    expect(para.text.length).toBeGreaterThan(0);
  }

  // The cleaned text should have tags removed
  const cleanedText = paragraphs.map(p => p.text).join(' ');

  expect(cleanedText).not.toContain("[SIDE CONVERSATION]");
  expect(cleanedText.toLowerCase()).toContain("gospel");
  expect(cleanedText.toLowerCase()).toContain("evangelism");

  console.log(`Detected ${paragraphs.length} paragraphs from ${mockSegments.length} segments`);
  console.log("First paragraph:", paragraphs[0]?.text.substring(0, 100) + "...");
  console.log("Last paragraph:", paragraphs[paragraphs.length - 1]?.text.substring(0, 100) + "...");
}, 30000); // 30s timeout for API call
-
-
// An empty raw transcript with no segments must yield an empty paragraph list.
test("cleanAndGetParagraphBoundaries handles empty transcript", async () => {
  const emptyInput = {
    transcriptId: "test-empty",
    rawTranscript: "",
    segments: [],
    maxWordsMove: 3,
  };

  const { paragraphs } = await cleanAndGetParagraphBoundaries(emptyInput);

  expect(paragraphs).toEqual([]);
});
-
-
/**
 * Verifies that a non-empty transcript produces an explicit error (and no
 * paragraphs) when OPENROUTER_API_KEY is absent from the environment.
 */
test("cleanAndGetParagraphBoundaries returns error on missing API key", async () => {
  const rawTranscript = "Test transcript";

  // Temporarily remove the key for the duration of this test.
  const originalKey = process.env.OPENROUTER_API_KEY;
  delete process.env.OPENROUTER_API_KEY;

  try {
    const result = await cleanAndGetParagraphBoundaries({
      transcriptId: "test-fallback",
      rawTranscript,
      segments: [{ text: rawTranscript }],
      maxWordsMove: 3,
    });

    expect(result.paragraphs).toBeUndefined();
    expect(result.error).toBe("OPENROUTER_API_KEY not set");
  } finally {
    // Restore the key even when an assertion above fails, so later tests
    // never observe a mutated environment. Compare against undefined so an
    // originally empty-string value is restored as well.
    if (originalKey !== undefined) {
      process.env.OPENROUTER_API_KEY = originalKey;
    }
  }
});
···
-134
src/lib/transcript-cleaner.ts
···
-
// Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.
-
/**
 * One paragraph of a transcript, expressed as a span of segment indexes
 * together with the paragraph's text.
 */
export interface ParagraphBoundary {
  /** Index of the first segment belonging to this paragraph. */
  startSegmentIndex: number;
  /** Index of the last segment belonging to this paragraph. */
  endSegmentIndex: number;
  /** Text of the paragraph. */
  text: string;
  /** Optional: list of moved words for auditing. */
  movedWords?: {
    word: string;
    fromSegmentIndex: number;
    toSegmentIndex: number;
  }[];
}
-
-
/** The part of the OpenRouter chat-completions response body that this module reads. */
type ChatCompletionPayload = {
  choices?: { message?: { content?: unknown } }[];
} | null;

/** Runtime check that a parsed JSON value has the required ParagraphBoundary fields. */
function isParagraphBoundary(value: unknown): value is ParagraphBoundary {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    Number.isInteger(candidate.startSegmentIndex) &&
    Number.isInteger(candidate.endSegmentIndex) &&
    typeof candidate.text === "string"
  );
}

/** JSON.parse that reports failure as `undefined` instead of throwing. */
function tryParseJson(text: string): unknown {
  try {
    return JSON.parse(text);
  } catch {
    return undefined;
  }
}

/**
 * Extracts the paragraph array from the model's reply. Tries the whole reply
 * first; if that is not a JSON array, falls back to the outermost `[...]`
 * substring in case the model padded its answer with extra text.
 * Returns null when no valid array of paragraph objects can be recovered.
 */
function parseParagraphs(raw: string): ParagraphBoundary[] | null {
  let candidate = tryParseJson(raw);
  if (!Array.isArray(candidate)) {
    const firstBracket = raw.indexOf("[");
    const lastBracket = raw.lastIndexOf("]");
    if (firstBracket >= 0 && lastBracket > firstBracket) {
      candidate = tryParseJson(raw.substring(firstBracket, lastBracket + 1));
    }
  }
  return Array.isArray(candidate) && candidate.every(isParagraphBoundary)
    ? candidate
    : null;
}

/** Builds the single user prompt that asks the model to clean and paragraph the transcript. */
function buildParagrapherPrompt(segmentsJson: string, rawTranscript: string): string {
  return `You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.

Your task: First, clean the transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
4. Fixing obvious speech recognition errors (e.g., "gr..." should be "grade")
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")

Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:
- Natural topic changes or shifts in the speaker's focus
- Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")
- Complete narrative beats or examples
- Typical spoken paragraph length (30-120 seconds / 5-20 segments)

CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.

RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):

[ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]

Rules for paragraphing:
- ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)
- NEVER break a paragraph in the middle of a sentence
- Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)
- DO NOT put the entire transcript in a single paragraph
- Paragraphs must reference original segment indexes
- Do not move words across segment boundaries
- Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission

Segments:
${segmentsJson}

Raw Transcript:
${rawTranscript}`;
}

/**
 * Cleans a transcript and determines paragraph boundaries in one LLM request
 * to the OpenRouter chat-completions endpoint.
 *
 * Reads OPENROUTER_API_KEY (required) and OPENROUTER_MODEL (optional, falls
 * back to "openrouter/polaris-alpha") from the environment.
 *
 * @param transcriptId   Identifier used only in the API-error log line.
 * @param rawTranscript  Raw transcript text; empty or whitespace-only input
 *                       returns `{ paragraphs: [] }` without any request.
 * @param segments       Original segments; missing index/start/end are sent as null.
 * @param maxWordsMove   NOTE(review): accepted but not used by this function —
 *                       the prompt always forbids moving words across segment
 *                       boundaries. Confirm the intended behavior with callers.
 * @returns `{ paragraphs }` on success, otherwise `{ error }`. This function
 *          does not throw; request and parsing failures are reported via `error`.
 */
export async function cleanAndGetParagraphBoundaries({
  transcriptId,
  rawTranscript,
  segments,
  maxWordsMove = 0,
}: {
  transcriptId: string;
  rawTranscript: string;
  segments: { index?: number; start?: number; end?: number; text: string }[];
  maxWordsMove?: number;
}): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
  // Skip processing if transcript is empty
  if (!rawTranscript || rawTranscript.trim().length === 0) {
    return { paragraphs: [] };
  }

  const apiKey = process.env.OPENROUTER_API_KEY;
  // `||` (not `??`) so an empty OPENROUTER_MODEL also falls back to the default.
  const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
  if (!apiKey) {
    return { error: "OPENROUTER_API_KEY not set" };
  }

  try {
    const segmentsPayload = segments.map((s) => ({
      index: s.index ?? null,
      start: s.start ?? null,
      end: s.end ?? null,
      text: s.text ?? "",
    }));

    const prompt = buildParagrapherPrompt(
      JSON.stringify(segmentsPayload, null, 2),
      rawTranscript,
    );

    const response = await fetch(
      "https://openrouter.ai/api/v1/chat/completions",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "Authorization": `Bearer ${apiKey}`,
          "HTTP-Referer": "https://thistle.app",
          "X-Title": "Thistle Transcription",
        },
        body: JSON.stringify({
          model,
          messages: [
            { role: "user", content: prompt },
          ],
          temperature: 0.0,
          max_tokens: 8192,
        }),
      },
    );

    if (!response.ok) {
      const errorText = await response.text();
      console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
      return { error: `OpenRouter API error: ${response.status}` };
    }

    const payload = (await response.json()) as ChatCompletionPayload;
    const content = payload?.choices?.[0]?.message?.content;
    // Anything other than a non-empty string counts as an empty reply.
    const raw = typeof content === "string" ? content.trim() : "";
    if (!raw) {
      return { error: "Empty paragrapher response" };
    }

    const paragraphs = parseParagraphs(raw);
    if (!paragraphs) {
      return { error: "Failed to parse paragrapher JSON" };
    }

    return { paragraphs };
  } catch (err) {
    console.error("[Paragrapher] Exception:", err);
    return { error: err instanceof Error ? err.message : "Unknown error" };
  }
}
···