🪻 distributed transcription service thistle.dunkirk.sh

feat: use gemini to cleanup transcripts

dunkirk.sh a88b2813 6e888ef1

verified
+5
.env.example
···
# URL of the faster-whisper transcription server
# See README for setup instructions
WHISPER_SERVICE_URL=http://localhost:8000
···
# URL of the faster-whisper transcription server
# See README for setup instructions
WHISPER_SERVICE_URL=http://localhost:8000
+
+
# Gemini API Key (optional)
+
# For cleaning transcripts - removes tags and improves grammar
+
# Get your key from: https://aistudio.google.com/app/apikey
+
# GEMINI_API_KEY=your_api_key_here
+53
src/lib/transcript-cleaner.test.ts
···
···
+
import { test, expect } from "bun:test";
+
import { cleanTranscript } from "./transcript-cleaner";
+
+
// Live integration test: it sends the transcript to the real Gemini API, so it
// is skipped unless GEMINI_API_KEY is configured. Without a key the raw
// transcript would come back with its tags intact and the assertions below
// could never pass.
test.skipIf(!process.env.GEMINI_API_KEY)(
  "cleanTranscript removes tags and fixes grammar",
  async () => {
    const rawTranscript = `[SIDE CONVERSATION] Yes? So with this course packet, what quiz is and exams, and if I can study through here, what you talk about? And I give you a good review every time. Yeah, so I'd be good to just study that and then we can do it. Yeah, and all the examples and stuff that we get from class especially. And then I, like your first quiz, I give you a mock quiz exactly like the quiz. Oh, okay. so you can kind of get a feel for how I do things. [inaudible] Okay? [inaudible] Yeah. [background chatter]`;

    const result = await cleanTranscript({
      transcriptId: "test-123",
      rawTranscript,
    });

    // Check that tags are removed
    expect(result.cleanedTranscript).not.toContain("[SIDE CONVERSATION]");
    expect(result.cleanedTranscript).not.toContain("[inaudible]");
    expect(result.cleanedTranscript).not.toContain("[background chatter]");

    // Check that we got some text back, and that dropping the tags made it shorter
    expect(result.cleanedTranscript.length).toBeGreaterThan(0);
    expect(result.cleanedTranscript.length).toBeLessThan(rawTranscript.length);

    console.log("Original:", rawTranscript.substring(0, 100));
    console.log("Cleaned:", result.cleanedTranscript.substring(0, 100));
  },
  30000, // 30s timeout for API call
);
+
+
test("cleanTranscript handles empty transcript", async () => {
  // An empty input has nothing to clean and must come back as an empty string.
  const { cleanedTranscript } = await cleanTranscript({
    rawTranscript: "",
    transcriptId: "test-empty",
  });

  expect(cleanedTranscript).toBe("");
});
+
+
test("cleanTranscript falls back to raw transcript on API error", async () => {
  const rawTranscript = "Test transcript";

  // Simulate a missing API key. The variable is removed here, so the outcome
  // does not depend on whether a key is configured in the environment.
  const originalKey = process.env.GEMINI_API_KEY;
  delete process.env.GEMINI_API_KEY;

  try {
    const result = await cleanTranscript({
      transcriptId: "test-fallback",
      rawTranscript,
    });

    expect(result.cleanedTranscript).toBe(rawTranscript);
    expect(result.error).toBe("GEMINI_API_KEY not set");
  } finally {
    // Restore the key even when an assertion above throws, so a failure here
    // cannot change the environment seen by other tests.
    if (originalKey !== undefined) {
      process.env.GEMINI_API_KEY = originalKey;
    }
  }
});
+127
src/lib/transcript-cleaner.ts
···
···
+
// Clean up transcripts using Gemini to remove tags and fix grammar
+
+
/** Input for `cleanTranscript`. */
interface CleanTranscriptOptions {
  /** Identifier used to label log messages. */
  transcriptId: string;
  /** Transcript text to clean. */
  rawTranscript: string;
}

/** Outcome of `cleanTranscript`. */
interface CleanTranscriptResult {
  /** Cleaned text, or the untouched raw transcript when cleaning was skipped or failed. */
  cleanedTranscript: string;
  /** Set only when cleaning could not be performed and the raw transcript was returned. */
  error?: string;
}

/** Endpoint of the Gemini model used for cleanup. */
const GEMINI_GENERATE_URL =
  "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent";

/** Narrow an unknown value to a non-null object whose properties can be read. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/**
 * Read the first candidate of a generateContent response body.
 *
 * @param payload - Parsed JSON body; treated as untrusted and narrowed field by field.
 * @returns The concatenated, trimmed text of every part of the first candidate
 *   (empty string when absent) and the candidate's finish reason, if reported.
 */
function readFirstCandidate(payload: unknown): {
  text: string;
  finishReason?: string;
} {
  if (!isRecord(payload) || !Array.isArray(payload.candidates)) {
    return { text: "" };
  }

  const candidate: unknown = payload.candidates[0];
  if (!isRecord(candidate)) {
    return { text: "" };
  }

  const finishReason =
    typeof candidate.finishReason === "string"
      ? candidate.finishReason
      : undefined;

  const content = candidate.content;
  const parts: unknown[] =
    isRecord(content) && Array.isArray(content.parts) ? content.parts : [];

  // A candidate may carry its text in several parts; join them all instead of
  // keeping only the first one.
  const text = parts
    .map((part) =>
      isRecord(part) && typeof part.text === "string" ? part.text : "",
    )
    .join("")
    .trim();

  return { text, finishReason };
}

/**
 * Clean a transcript using Gemini Flash 2.0.
 *
 * Asks the model to remove tags like [SIDE CONVERSATION] and [inaudible] and
 * to fix grammar while preserving sentence structure.
 *
 * This function never throws. Whenever cleaning cannot be completed (missing
 * API key, HTTP error, empty or incomplete model output, network failure) it
 * returns the raw transcript unchanged together with an `error` message.
 *
 * @param options.transcriptId - Identifier used in log messages.
 * @param options.rawTranscript - Text to clean.
 * @returns The cleaned text, or the raw transcript plus `error` on failure.
 */
export async function cleanTranscript({
  transcriptId,
  rawTranscript,
}: CleanTranscriptOptions): Promise<CleanTranscriptResult> {
  /** Build the "give the caller back what it sent" result. */
  const fallback = (error: string): CleanTranscriptResult => ({
    cleanedTranscript: rawTranscript,
    error,
  });

  const apiKey = process.env.GEMINI_API_KEY;

  if (!apiKey) {
    return fallback("GEMINI_API_KEY not set");
  }

  // Skip cleaning if transcript is empty
  if (!rawTranscript || rawTranscript.trim().length === 0) {
    return {
      cleanedTranscript: rawTranscript,
    };
  }

  console.log(
    `[TranscriptCleaner] Starting cleanup for ${transcriptId} (${rawTranscript.length} chars)`,
  );

  try {
    const prompt = `You are a transcript editor. Clean up this transcript by:
1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.
2. Fixing grammar and punctuation to make sentences readable
3. Preserving the original sentence structure and wording as much as possible
4. Fixing obvious speech recognition errors (e.g., "gr..." should be "grade")
5. NOT adding any new content or changing the meaning
6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")

Return ONLY the cleaned transcript text, nothing else.

Transcript to clean:
${rawTranscript}`;

    const response = await fetch(GEMINI_GENERATE_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": apiKey,
      },
      body: JSON.stringify({
        contents: [
          {
            parts: [{ text: prompt }],
          },
        ],
        generationConfig: {
          temperature: 0.3,
          topK: 40,
          topP: 0.95,
          maxOutputTokens: 8192,
        },
      }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error(
        `[TranscriptCleaner] Gemini API error for ${transcriptId}:`,
        errorText,
      );
      return fallback(`Gemini API error: ${response.status}`);
    }

    const payload: unknown = await response.json();
    const { text: cleanedText, finishReason } = readFirstCandidate(payload);

    // Anything other than a natural stop (e.g. MAX_TOKENS on a long transcript)
    // means the text is cut off or withheld. Returning it would silently
    // replace the full transcript with a partial one, so fall back instead.
    if (finishReason !== undefined && finishReason !== "STOP") {
      console.warn(
        `[TranscriptCleaner] Incomplete response from Gemini for ${transcriptId}: ${finishReason}`,
      );
      return fallback(`Gemini response incomplete: ${finishReason}`);
    }

    if (!cleanedText) {
      console.warn(
        `[TranscriptCleaner] Empty response from Gemini for ${transcriptId}`,
      );
      return fallback("Empty response from Gemini");
    }

    // rawTranscript is non-empty here (checked above), so the division is safe.
    const reduction = Math.round(
      ((rawTranscript.length - cleanedText.length) / rawTranscript.length) *
        100,
    );
    console.log(
      `[TranscriptCleaner] Completed for ${transcriptId}: ${rawTranscript.length} → ${cleanedText.length} chars (${reduction}% reduction)`,
    );

    return {
      cleanedTranscript: cleanedText,
    };
  } catch (error: unknown) {
    console.error(
      `[TranscriptCleaner] Failed to clean ${transcriptId}:`,
      error,
    );
    return fallback(error instanceof Error ? error.message : "Unknown error");
  }
}
+6 -25
src/lib/transcription.ts
···
import type { Database } from "bun:sqlite";
import { createEventSource } from "eventsource-client";
import { ErrorCode } from "./errors";
-
import { saveTranscript, saveTranscriptVTT } from "./transcript-storage";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
let transcript = update.transcript ?? "";
transcript = transcript.replace(/<\|[^|]+\|>/g, "").trim();
-
// Save transcript to file (overwrites on each update)
-
if (transcript) {
-
await saveTranscript(transcriptionId, transcript);
-
}
-
this.updateTranscription(transcriptionId, {
status,
progress,
···
transcript: transcript || undefined,
});
} else if (update.status === "completed") {
-
// Final transcript should already have tokens stripped by Murmur
-
const transcript = update.transcript ?? "";
-
-
// Save final transcript to file
-
if (transcript) {
-
await saveTranscript(transcriptionId, transcript);
-
}
-
// Fetch and save VTT file from Murmur
const whisperJobId = this.db
.query<{ whisper_job_id: string }, [string]>(
···
);
if (vttResponse.ok) {
const vttContent = await vttResponse.text();
-
await saveTranscriptVTT(transcriptionId, vttContent);
}
} catch (error) {
console.warn(
···
this.events.emit(transcriptionId, {
status: "completed",
progress: 100,
-
transcript,
});
// Close stream - keep audio file for playback
···
if (!details) return;
if (details.status === "completed") {
-
const transcript = details.transcript ?? "";
-
-
// Save transcript to file
-
if (transcript) {
-
await saveTranscript(transcriptionId, transcript);
-
}
-
// Fetch and save VTT file
try {
const vttResponse = await fetch(
···
);
if (vttResponse.ok) {
const vttContent = await vttResponse.text();
-
await saveTranscriptVTT(transcriptionId, vttContent);
}
} catch (error) {
console.warn(
···
this.events.emit(transcriptionId, {
status: "completed",
progress: 100,
-
transcript,
});
} else if (details.status === "failed") {
const errorMessage = (
···
import type { Database } from "bun:sqlite";
import { createEventSource } from "eventsource-client";
import { ErrorCode } from "./errors";
+
import { saveTranscriptVTT } from "./transcript-storage";
+
import { cleanVTT } from "./vtt-cleaner";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
···
let transcript = update.transcript ?? "";
transcript = transcript.replace(/<\|[^|]+\|>/g, "").trim();
this.updateTranscription(transcriptionId, {
status,
progress,
···
transcript: transcript || undefined,
});
} else if (update.status === "completed") {
// Fetch and save VTT file from Murmur
const whisperJobId = this.db
.query<{ whisper_job_id: string }, [string]>(
···
);
if (vttResponse.ok) {
const vttContent = await vttResponse.text();
+
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+
await saveTranscriptVTT(transcriptionId, cleanedVTT);
}
} catch (error) {
console.warn(
···
this.events.emit(transcriptionId, {
status: "completed",
progress: 100,
});
// Close stream - keep audio file for playback
···
if (!details) return;
if (details.status === "completed") {
// Fetch and save VTT file
try {
const vttResponse = await fetch(
···
);
if (vttResponse.ok) {
const vttContent = await vttResponse.text();
+
const cleanedVTT = await cleanVTT(transcriptionId, vttContent);
+
await saveTranscriptVTT(transcriptionId, cleanedVTT);
}
} catch (error) {
console.warn(
···
this.events.emit(transcriptionId, {
status: "completed",
progress: 100,
});
} else if (details.status === "failed") {
const errorMessage = (
+38
src/lib/vtt-cleaner.test.ts
···
···
+
import { test, expect } from "bun:test";
+
import { cleanVTT } from "./vtt-cleaner";
+
+
// Fixture: a short VTT file containing Whisper special tokens and a bracketed tag.
const sampleVTT = `WEBVTT

00:00:00.000 --> 00:00:03.480
<|startoftranscript|> [SIDE CONVERSATION]<|endoftext|>

00:00:00.000 --> 00:00:00.000
<|startoftranscript|> Yes?

00:00:00.000 --> 00:00:00.000
So with this course packet, what quiz is and exams, and if I can study through here, what you talk about?

00:00:00.000 --> 00:00:00.000
And I give you a good review every time.

00:00:00.000 --> 00:00:00.000
Yeah, so I'd be good to just study that and then we can do it.`;

// Live integration test: the cleanup is performed by the real Gemini API, so
// the test is skipped unless GEMINI_API_KEY is configured. Without a key the
// tag assertions below cannot be satisfied.
test.skipIf(!process.env.GEMINI_API_KEY)(
  "cleanVTT removes tags and cleans text",
  async () => {
    const result = await cleanVTT("test-vtt", sampleVTT);

    // Structure is kept...
    expect(result).toContain("WEBVTT");
    expect(result).toContain("-->");

    // ...while tags and special tokens are gone.
    expect(result).not.toContain("[SIDE CONVERSATION]");
    expect(result).not.toContain("<|startoftranscript|>");
    expect(result).not.toContain("<|endoftext|>");

    console.log("Cleaned VTT preview:", result.substring(0, 200));
  },
  30000,
);
+
+
test("cleanVTT preserves empty VTT", async () => {
  // A header-only file must be handed back unchanged.
  const headerOnly = "WEBVTT\n\n";

  const cleaned = await cleanVTT("test-empty", headerOnly);

  expect(cleaned).toBe(headerOnly);
});
+125
src/lib/vtt-cleaner.ts
···
···
+
// Parse and clean VTT files
+
+
import { cleanTranscript } from "./transcript-cleaner";
+
+
/** One cue of a VTT file. */
interface VTTSegment {
  /** Numeric cue identifier, when the line directly above the timing line is a number. */
  index?: number;
  /** The full timing line (the line containing "-->"), trimmed. */
  timestamp: string;
  /** Cue text; multiple lines are joined with "\n". Never empty. */
  text: string;
}

/**
 * Parse VTT content into segments.
 *
 * Recognises timing lines (any line containing "-->"), the text lines that
 * follow them up to the next blank or timing line, and an optional all-digit
 * identifier line directly above the timing line. The "WEBVTT" header and any
 * other lines are ignored. Cues without text are skipped.
 *
 * @param vttContent - Raw VTT file content; LF and CRLF line endings are accepted.
 * @returns The cues in file order.
 */
function parseVTT(vttContent: string): VTTSegment[] {
  // Split on CRLF as well as LF so no "\r" ends up inside multi-line cue text.
  const lines = vttContent.split(/\r?\n/);
  const segments: VTTSegment[] = [];
  let pendingIndex: number | undefined;

  let i = 0;
  while (i < lines.length) {
    const line = (lines[i] ?? "").trim();
    i++;

    if (!line) {
      // A blank line ends the current cue block, so an identifier seen before
      // it must not attach to a later cue.
      pendingIndex = undefined;
      continue;
    }

    if (line === "WEBVTT") {
      continue;
    }

    if (line.includes("-->")) {
      // Collect the text lines up to the next blank line or timing line.
      const textLines: string[] = [];
      while (i < lines.length) {
        const next = lines[i] ?? "";
        if (!next.trim() || next.includes("-->")) {
          break;
        }
        textLines.push(next);
        i++;
      }

      // Emit the cue right away: the next line may already be another timing
      // line (no blank separator), which must not overwrite this cue.
      const text = textLines.join("\n").trim();
      if (text) {
        const segment: VTTSegment = { timestamp: line, text };
        if (pendingIndex !== undefined) {
          segment.index = pendingIndex;
        }
        segments.push(segment);
      }
      pendingIndex = undefined;
    } else if (/^\d+$/.test(line)) {
      // It's an index number
      pendingIndex = Number.parseInt(line, 10);
    }
  }

  return segments;
}
+
+
/**
 * Clean VTT text segments by removing tags and fixing grammar.
 *
 * The text of all cues is joined and cleaned in a single `cleanTranscript`
 * call, then the cleaned words are spread back over the original cue timings
 * in proportion to each cue's original word count. The alignment is therefore
 * approximate: timings are kept, but word boundaries between cues may shift.
 *
 * @param transcriptionId - Identifier used for logging and passed to the cleaner.
 * @param vttContent - Raw VTT file content.
 * @returns The rebuilt VTT, or `vttContent` unchanged when it contains no
 *   cues or the cleaner reports an error.
 */
export async function cleanVTT(
  transcriptionId: string,
  vttContent: string,
): Promise<string> {
  const segments = parseVTT(vttContent);

  if (segments.length === 0) {
    return vttContent;
  }

  console.log(
    `[VTTCleaner] Cleaning ${segments.length} segments for ${transcriptionId}`,
  );

  // Combine all text for cleaning
  const allText = segments.map((s) => s.text).join(" ");

  const { cleanedTranscript, error } = await cleanTranscript({
    transcriptId: transcriptionId,
    rawTranscript: allText,
  });

  if (error) {
    console.warn(`[VTTCleaner] Falling back to original VTT: ${error}`);
    return vttContent;
  }

  const splitWords = (text: string): string[] =>
    text.split(/\s+/).filter(Boolean);

  const words = splitWords(cleanedTranscript);
  const originalCounts = segments.map((s) => splitWords(s.text).length);
  const totalOriginalWords = originalCounts.reduce((sum, n) => sum + n, 0);

  // Nothing to distribute (or nothing to weight by): keep the file as it was
  // rather than emitting a VTT with no cues.
  if (words.length === 0 || totalOriginalWords === 0) {
    return vttContent;
  }

  // Split cleaned text back into segments.
  // Each cue's end boundary is computed from the cumulative share of original
  // words seen so far, and the last cue always ends at words.length. Rounding
  // errors therefore cannot accumulate: every cleaned word is placed exactly
  // once, with none dropped at the end and none read past the end.
  const cleanedSegments: VTTSegment[] = [];
  const lastPosition = segments.length - 1;
  let originalWordsSeen = 0;
  let start = 0;

  segments.forEach((segment, position) => {
    originalWordsSeen += originalCounts[position] ?? 0;
    const end =
      position === lastPosition
        ? words.length
        : Math.round((originalWordsSeen / totalOriginalWords) * words.length);

    const text = words.slice(start, end).join(" ");
    start = end;

    // A cue whose share rounds to zero words (typically one that held only
    // removed tags) is omitted instead of being written with empty text.
    if (text) {
      cleanedSegments.push({
        timestamp: segment.timestamp,
        text,
        index: segment.index,
      });
    }
  });

  // Rebuild VTT
  let output = "WEBVTT\n\n";
  for (const segment of cleanedSegments) {
    if (segment.index !== undefined) {
      output += `${segment.index}\n`;
    }
    output += `${segment.timestamp}\n`;
    output += `${segment.text}\n\n`;
  }

  console.log(`[VTTCleaner] Completed for ${transcriptionId}`);

  return output;
}