commit 12c8fdc6eafcb397bdedd4be0ec2ceb514e7d8ea · dunkirk.sh/thistle

-96

src/lib/transcript-cleaner.test.ts

···

       1
       -
       import { test, expect } from "bun:test";

     

       2
       -
       import { cleanAndGetParagraphBoundaries } from "./transcript-cleaner";

     

       3
       -
       

     

       4
       -
       // AI integration test - calls the real OpenRouter API, so it is skipped unless explicitly
       // enabled, to avoid burning credits.
       // Run with: RUN_AI_TESTS=1 bun test src/lib/transcript-cleaner.test.ts --test-name-pattern "AI"
       // (OPENROUTER_API_KEY must also be set, otherwise the cleaner returns an error.)
       test.skipIf(process.env.RUN_AI_TESTS !== "1")(
       	"AI: cleanAndGetParagraphBoundaries cleans transcript and returns paragraph boundaries",
       	async () => {
       		// Use a longer, more realistic transcript sample with natural paragraph breaks.
       		// Paragraphs are separated by a blank line, exactly as in a raw transcript.
       		const rawTranscript = [
       			`[SIDE CONVERSATION] Today in chapel we are talking about the fact that we believe in having gospel conversations. I'm gonna run my own PowerPoint. I'm gonna jump around. It's gonna be a little more conversational than normal. It's not gonna be like one of the normal sermons, although I know me and my tendency it'll turn into a sermon at some point just because that's the way God made me, so I can't help it.`,
       			`Alright, so when it starts just have fun with it. We'll go on. Here's what it says in our doctrinal statement. It says, "Due to the commission of Christ and the urgency of the Gospel, all believers are to engage in Gospel conversations." How many of you believe that? That's pretty weak. How many of you believe that?`,
       			`To live God-honoring lives and to work continuously for the spread of the Gospel to their neighbors and the nations. Now, let's be honest, as we start off this morning, all of us could do a better job with personal evangelism, and all of us could do a better job with a heart for missions.`,
       			`So I'm not up here talking to you about something I have conquered or mastered. I'm not the expert on this. In fact, when it comes to personal evangelism in my own strength, I'm often a complete failure. But I have found that even in my weakness, God can use me in powerful ways when I make myself available to Him.`,
       		].join("\n\n");

       		// Create mock segments from raw transcript (simulating whisper output):
       		// one segment per sentence, with contiguous synthetic timestamps.
       		const mockSegments: { index?: number; start?: number; end?: number; text: string }[] = [];
       		let timeOffset = 0;
       		for (const [i, piece] of rawTranscript.split(/\.\s+/).entries()) {
       			const sentence = piece.trim();
       			if (!sentence) continue;
       			const duration = sentence.split(/\s+/).length * 0.3; // ~0.3s per word
       			mockSegments.push({
       				index: i,
       				start: timeOffset,
       				end: timeOffset + duration,
       				text: sentence,
       			});
       			timeOffset += duration;
       		}

       		const result = await cleanAndGetParagraphBoundaries({
       			transcriptId: "test-123",
       			rawTranscript,
       			segments: mockSegments,
       			maxWordsMove: 3,
       		});

       		// Check that we got a result (surface the API error first for a readable failure)
       		expect(result.error).toBeUndefined();
       		expect(result.paragraphs).toBeDefined();
       		const paragraphs = result.paragraphs ?? [];
       		expect(paragraphs.length).toBeGreaterThan(1); // Should have multiple paragraphs

       		// Check that paragraphs have the expected structure
       		for (const para of paragraphs) {
       			expect(para).toHaveProperty('startSegmentIndex');
       			expect(para).toHaveProperty('endSegmentIndex');
       			expect(para).toHaveProperty('text');
       			expect(para.text.length).toBeGreaterThan(0);
       		}

       		// The cleaned text should have tags removed while keeping the content words
       		const cleanedText = paragraphs.map((p) => p.text).join(' ');

       		expect(cleanedText).not.toContain("[SIDE CONVERSATION]");
       		expect(cleanedText.toLowerCase()).toContain("gospel");
       		expect(cleanedText.toLowerCase()).toContain("evangelism");

       		console.log(`Detected ${paragraphs.length} paragraphs from ${mockSegments.length} segments`);
       		console.log("First paragraph:", paragraphs[0]?.text.substring(0, 100) + "...");
       		console.log("Last paragraph:", paragraphs[paragraphs.length - 1]?.text.substring(0, 100) + "...");
       	},
       	30000, // 30s timeout for API call
       );

     

       63
       -
       

     

       64
       -
       // An empty transcript yields an empty paragraph list.
       test("cleanAndGetParagraphBoundaries handles empty transcript", async () => {
       	const emptyInput = {
       		transcriptId: "test-empty",
       		rawTranscript: "",
       		segments: [],
       		maxWordsMove: 3,
       	};

       	const { paragraphs } = await cleanAndGetParagraphBoundaries(emptyInput);

       	expect(paragraphs).toEqual([]);
       });

     

       74
       -
       

     

       75
       -
       // Without OPENROUTER_API_KEY the cleaner must report an error instead of paragraphs.
       test("cleanAndGetParagraphBoundaries returns error on missing API key", async () => {
       	const rawTranscript = "Test transcript";

       	// Temporarily remove the key so the test behaves the same whether or not it is configured.
       	const originalKey = process.env.OPENROUTER_API_KEY;
       	delete process.env.OPENROUTER_API_KEY;

       	try {
       		const result = await cleanAndGetParagraphBoundaries({
       			transcriptId: "test-fallback",
       			rawTranscript,
       			segments: [{ text: rawTranscript }],
       			maxWordsMove: 3,
       		});

       		expect(result.paragraphs).toBeUndefined();
       		expect(result.error).toBe("OPENROUTER_API_KEY not set");
       	} finally {
       		// Restore key even when an assertion above throws, so later tests see the
       		// original environment. Compare against undefined so an empty value is kept too.
       		if (originalKey !== undefined) {
       			process.env.OPENROUTER_API_KEY = originalKey;
       		}
       	}
       });

-134

src/lib/transcript-cleaner.ts

···

       1
       -
       // Paragraph boundary detection using OpenRouter. Returns a JSON array of paragraph objects.

     

       2
       -
       /**
        * One paragraph produced by the paragrapher: the range of original segments it
        * refers to and the paragraph text.
        */
       export interface ParagraphBoundary {
       	/** Index of the first original segment in this paragraph. */
       	startSegmentIndex: number;
       	/** Index of the last original segment in this paragraph (presumably inclusive; confirm with callers). */
       	endSegmentIndex: number;
       	/** Paragraph text. */
       	text: string;
       	/** Optional: list of moved words for auditing. */
       	movedWords?: Array<{
       		word: string;
       		fromSegmentIndex: number;
       		toSegmentIndex: number;
       	}>;
       }

     

       9
       -
       

     

       10
       -
       /**
        * Builds the single-message prompt sent to the model: cleaning instructions,
        * paragraphing rules, the segments as pretty-printed JSON, then the raw transcript.
        */
       function buildParagrapherPrompt(
       	segments: { index?: number; start?: number; end?: number; text: string }[],
       	rawTranscript: string,
       ): string {
       	// Missing optional fields are sent as explicit nulls so every segment has the same shape.
       	const segmentsPayload = segments.map((s) => ({
       		index: s.index ?? null,
       		start: s.start ?? null,
       		end: s.end ?? null,
       		text: s.text ?? "",
       	}));

       	return [
       		"You are a transcript editor and paragrapher. Input: a list of original transcript segments with their index, start time (seconds), end time (seconds), and the RAW transcript text.",
       		"",
       		"Your task: First, clean the transcript by:",
       		"1. Removing ALL tags like [SIDE CONVERSATION], [inaudible], [background chatter], etc.",
       		"2. Fixing grammar and punctuation to make sentences readable",
       		"3. Preserving the original sentence structure and wording as much as possible",
       		'4. Fixing obvious speech recognition errors (e.g., "gr..." should be "grade")',
       		"5. NOT adding any new content or changing the meaning",
       		'6. If there are obvious speaking mistakes then you can fix those (e.g. "we are going no wait sorry you should be doing")',
       		"",
       		"Then, determine paragraph boundaries by grouping the cleaned segments into logical paragraphs. A paragraph represents a complete thought, topic, or idea. Create MULTIPLE paragraphs based on:",
       		"- Natural topic changes or shifts in the speaker's focus",
       		'- Pauses or transitions in the speech ("Now...", "So...", "Let me tell you...", "Alright...")',
       		"- Complete narrative beats or examples",
       		"- Typical spoken paragraph length (30-120 seconds / 5-20 segments)",
       		"",
       		"CRITICAL: Each paragraph MUST end with a complete sentence. DO NOT break paragraphs mid-sentence.",
       		"",
       		"RETURN ONLY a JSON array of objects, EXACTLY in this format (no additional text):",
       		"",
       		'[ {"startSegmentIndex": <int>, "endSegmentIndex": <int>, "text": "<paragraph text>"}, ... ]',
       		"",
       		"Rules for paragraphing:",
       		"- ALWAYS end paragraphs at sentence boundaries (after periods, question marks, or exclamation points)",
       		"- NEVER break a paragraph in the middle of a sentence",
       		"- Create AT LEAST one paragraph for every 30-60 seconds of speech (roughly 5-10 segments)",
       		"- DO NOT put the entire transcript in a single paragraph",
       		"- Paragraphs must reference original segment indexes",
       		"- Do not move words across segment boundaries",
       		"- Return the paragraphs in order and cover the entire cleaned transcript text without overlap or omission",
       		"",
       		"Segments:",
       		JSON.stringify(segmentsPayload, null, 2),
       		"",
       		"Raw Transcript:",
       		rawTranscript,
       	].join("\n");
       }

       /**
        * Parses the model reply as JSON. If the whole reply is not valid JSON (e.g. the model
        * padded it with prose or a code fence), retries on the span from the first "[" to the
        * last "]". Returns null when neither attempt yields valid JSON; never throws.
        */
       function parseParagrapherJson(raw: string): unknown {
       	try {
       		return JSON.parse(raw);
       	} catch {
       		const firstBracket = raw.indexOf("[");
       		const lastBracket = raw.lastIndexOf("]");
       		if (firstBracket < 0 || lastBracket <= firstBracket) {
       			return null;
       		}
       		try {
       			return JSON.parse(raw.substring(firstBracket, lastBracket + 1));
       		} catch {
       			return null;
       		}
       	}
       }

       /** Runtime check that a parsed JSON value has the required ParagraphBoundary fields. */
       function isParagraphBoundary(value: unknown): value is ParagraphBoundary {
       	if (typeof value !== "object" || value === null) {
       		return false;
       	}
       	const candidate = value as Record<string, unknown>;
       	return (
       		Number.isInteger(candidate.startSegmentIndex) &&
       		Number.isInteger(candidate.endSegmentIndex) &&
       		typeof candidate.text === "string"
       	);
       }

       /**
        * Cleans a transcript and determines paragraph boundaries in one LLM request
        * (OpenRouter chat completions).
        *
        * Reads `OPENROUTER_API_KEY` (required) and `OPENROUTER_MODEL` (optional) from the
        * environment. Failures are reported through the `error` field rather than thrown.
        *
        * @param transcriptId - Identifier used only in log messages.
        * @param rawTranscript - Raw transcript text; empty or whitespace-only input short-circuits.
        * @param segments - Original transcript segments sent to the model alongside the raw text.
        * @param maxWordsMove - Accepted for interface compatibility; currently not used (the
        *   prompt tells the model not to move words across segment boundaries).
        * @returns `{ paragraphs }` on success (`[]` for an empty transcript), otherwise `{ error }`:
        *   missing API key, non-2xx HTTP status, empty reply, unparseable or malformed JSON,
        *   or the message of any exception raised during the request.
        */
       export async function cleanAndGetParagraphBoundaries({
       	transcriptId,
       	rawTranscript,
       	segments,
       	maxWordsMove = 0,
       }: {
       	transcriptId: string;
       	rawTranscript: string;
       	segments: { index?: number; start?: number; end?: number; text: string }[];
       	maxWordsMove?: number;
       }): Promise<{ paragraphs?: ParagraphBoundary[]; error?: string }> {
       	// Skip processing (and the API call) if transcript is empty
       	if (!rawTranscript || rawTranscript.trim().length === 0) {
       		return { paragraphs: [] };
       	}

       	const apiKey = process.env.OPENROUTER_API_KEY;
       	// `||` rather than `??` so an empty OPENROUTER_MODEL also falls back to the default.
       	const model = process.env.OPENROUTER_MODEL || "openrouter/polaris-alpha";
       	if (!apiKey) {
       		return { error: "OPENROUTER_API_KEY not set" };
       	}

       	try {
       		const prompt = buildParagrapherPrompt(segments, rawTranscript);

       		const response = await fetch(
       			"https://openrouter.ai/api/v1/chat/completions",
       			{
       				method: "POST",
       				headers: {
       					"Content-Type": "application/json",
       					"Authorization": `Bearer ${apiKey}`,
       					"HTTP-Referer": "https://thistle.app",
       					"X-Title": "Thistle Transcription",
       				},
       				body: JSON.stringify({
       					model,
       					messages: [
       						{ role: "user", content: prompt },
       					],
       					temperature: 0.0,
       					max_tokens: 8192,
       				}),
       			},
       		);

       		if (!response.ok) {
       			const errorText = await response.text();
       			console.error(`[Paragrapher] OpenRouter error for ${transcriptId}:`, errorText);
       			return { error: `OpenRouter API error: ${response.status}` };
       		}

       		// Narrow the untyped response body instead of trusting its shape.
       		const payload = (await response.json()) as
       			| { choices?: { message?: { content?: unknown } }[] }
       			| null;
       		const content = payload?.choices?.[0]?.message?.content;
       		const raw = typeof content === "string" ? content.trim() : "";
       		if (!raw) {
       			return { error: "Empty paragrapher response" };
       		}

       		// Both invalid JSON and JSON of the wrong shape are reported the same way, so
       		// callers never receive objects that merely claim to be ParagraphBoundary.
       		const parsed = parseParagrapherJson(raw);
       		if (!Array.isArray(parsed) || !parsed.every(isParagraphBoundary)) {
       			console.error(`[Paragrapher] Unparseable response for ${transcriptId}`);
       			return { error: "Failed to parse paragrapher JSON" };
       		}

       		return { paragraphs: parsed };
       	} catch (err) {
       		// Network failures and other unexpected exceptions end up here.
       		console.error("[Paragrapher] Exception:", err);
       		return { error: err instanceof Error ? err.message : "Unknown error" };
       	}
       }