🪻 distributed transcription service thistle.dunkirk.sh

feat: store transcripts as files

dunkirk.sh 8a7efe9e 49ca65d3

verified
+1
.gitignore
···
node_modules
*.db
uploads/
+
transcripts/
.env
+92
CRUSH.md
···
Use `bun test` to run tests.
+
### Basic Test Structure
+
```ts
import { test, expect } from "bun:test";
···
expect(1).toBe(1);
});
```
+
+
### Test File Naming
+
+
- Place tests next to the code they test: `foo.ts` → `foo.test.ts`
+
- This keeps tests close to implementation for easy maintenance
+
- Bun automatically discovers `*.test.ts` files
+
+
### Writing Good Tests
+
+
**Test security-critical code:**
+
- File path operations (directory traversal, injection)
+
- User input validation
+
- Authentication/authorization
+
- API endpoint security
+
+
**Test edge cases:**
+
- Empty strings, null, undefined
+
- Very large inputs (size limits)
+
- Invalid formats
+
- Boundary conditions
+
+
**Test async operations:**
+
```ts
+
test("async function", async () => {
+
const result = await someAsyncFunction();
+
expect(result).toBe("expected value");
+
});
+
```
+
+
**Test error conditions:**
+
```ts
+
test("rejects invalid input", async () => {
+
await expect(dangerousFunction("../../../etc/passwd")).rejects.toThrow();
+
await expect(dangerousFunction("invalid")).rejects.toThrow("Invalid format");
+
});
+
```
+
+
**Example: Security-focused tests**
+
```ts
+
test("prevents directory traversal", async () => {
+
const maliciousIds = [
+
"../../../etc/passwd",
+
"../../secret.txt",
+
"test/../../../config",
+
];
+
+
for (const id of maliciousIds) {
+
await expect(loadFile(id)).rejects.toThrow();
+
}
+
});
+
+
test("validates input format", async () => {
+
const invalidInputs = [
+
"test; rm -rf /",
+
"test`whoami`",
+
"test\x00null",
+
];
+
+
for (const input of invalidInputs) {
+
await expect(processInput(input)).rejects.toThrow("Invalid format");
+
}
+
});
+
```
+
+
### Running Tests
+
+
```bash
+
# Run all tests
+
bun test
+
+
# Run specific test file
+
bun test src/lib/auth.test.ts
+
+
# Watch mode (re-run on changes)
+
bun test --watch
+
```
+
+
### What to Test
+
+
**Always test:**
+
- Security-critical functions (file I/O, user input)
+
- Complex business logic
+
- Edge cases and error handling
+
- Public API functions
+
+
**Don't need to test:**
+
- Simple getters/setters
+
- Framework/library code
+
- UI components (unless complex logic)
+
- One-line utility functions
## TypeScript Configuration
+8
src/db/schema.ts
···
CREATE INDEX IF NOT EXISTS idx_transcriptions_whisper_job_id ON transcriptions(whisper_job_id);
`,
},
+
{
+
version: 4,
+
name: "Remove transcript column from transcriptions",
+
sql: `
+
-- SQLite 3.35.0+ supports DROP COLUMN
+
ALTER TABLE transcriptions DROP COLUMN transcript;
+
`,
+
},
];
function getCurrentVersion(): number {
+34 -20
src/index.ts
···
type TranscriptionUpdate,
WhisperServiceManager,
} from "./lib/transcription";
+
import { getTranscript } from "./lib/transcript-storage";
import indexHTML from "./pages/index.html";
import settingsHTML from "./pages/settings.html";
import transcribeHTML from "./pages/transcribe.html";
···
const WHISPER_SERVICE_URL =
process.env.WHISPER_SERVICE_URL || "http://localhost:8000";
-
// Create uploads directory if it doesn't exist
+
// Create uploads and transcripts directories if they don't exist
await Bun.write("./uploads/.gitkeep", "");
+
await Bun.write("./transcripts/.gitkeep", "");
// Initialize transcription system
console.log(
···
},
},
"/api/transcriptions/:id/stream": {
-
GET: (req) => {
+
GET: async (req) => {
const sessionId = getSessionFromRequest(req);
if (!sessionId) {
return Response.json({ error: "Not authenticated" }, { status: 401 });
···
}
// Event-driven SSE stream with reconnection support
const stream = new ReadableStream({
-
start(controller) {
+
async start(controller) {
const encoder = new TextEncoder();
let isClosed = false;
let lastEventId = Math.floor(Date.now() / 1000);
···
isClosed = true;
}
};
-
// Send initial state from DB
+
// Send initial state from DB and file
const current = db
.query<
{
status: string;
progress: number;
-
transcript: string | null;
},
[string]
>(
-
"SELECT status, progress, transcript FROM transcriptions WHERE id = ?",
+
"SELECT status, progress FROM transcriptions WHERE id = ?",
)
.get(transcriptionId);
if (current) {
+
// Load transcript from file if completed
+
let transcript: string | undefined;
+
if (current.status === "completed") {
+
transcript = (await getTranscript(transcriptionId)) || undefined;
+
}
sendEvent({
status: current.status as TranscriptionUpdate["status"],
progress: current.progress,
-
transcript: current.transcript || undefined,
+
transcript,
});
}
// If already complete, close immediately
···
},
},
"/api/transcriptions": {
-
GET: (req) => {
+
GET: async (req) => {
try {
const user = requireAuth(req);
···
original_filename: string;
status: string;
progress: number;
-
transcript: string | null;
created_at: number;
},
[number]
>(
-
"SELECT id, filename, original_filename, status, progress, transcript, created_at FROM transcriptions WHERE user_id = ? ORDER BY created_at DESC",
+
"SELECT id, filename, original_filename, status, progress, created_at FROM transcriptions WHERE user_id = ? ORDER BY created_at DESC",
)
.all(user.id);
-
return Response.json({
-
jobs: transcriptions.map((t) => ({
-
id: t.id,
-
filename: t.original_filename,
-
status: t.status,
-
progress: t.progress,
-
transcript: t.transcript,
-
created_at: t.created_at,
-
})),
-
});
+
// Load transcripts from files for completed jobs
+
const jobs = await Promise.all(
+
transcriptions.map(async (t) => {
+
let transcript: string | null = null;
+
if (t.status === "completed") {
+
transcript = await getTranscript(t.id);
+
}
+
return {
+
id: t.id,
+
filename: t.original_filename,
+
status: t.status,
+
progress: t.progress,
+
transcript,
+
created_at: t.created_at,
+
};
+
}),
+
);
+
+
return Response.json({ jobs });
} catch (error) {
return handleError(error);
}
+1 -1
src/lib/auth.ts
···
password: string,
): Promise<User | null> {
const result = db
-
.query<{ id: number; email: string; name: string | null; password_hash: string; created_at: number }, [string]>(
+
.query<{ id: number; email: string; name: string | null; avatar: string; password_hash: string; created_at: number }, [string]>(
"SELECT id, email, name, avatar, password_hash, created_at FROM users WHERE email = ?",
)
.get(email);
+93
src/lib/transcript-storage.test.ts
···
+
import { expect, test } from "bun:test";
+
import {
+
deleteTranscript,
+
getTranscript,
+
hasTranscript,
+
saveTranscript,
+
} from "./transcript-storage";
+
+
// Round-trip check: a transcript is absent, then saved and readable, then
// deleted and absent again.
test("transcript storage", async () => {
  const testId = "test-transcript-123";
  const testContent = "This is a test transcript with some content.";

  // Remove anything a previously aborted run may have left on disk so the
  // "does not exist initially" assertions below are not poisoned by stale state.
  await deleteTranscript(testId);

  try {
    // Should not exist initially
    expect(await hasTranscript(testId)).toBe(false);
    expect(await getTranscript(testId)).toBe(null);

    // Save transcript
    await saveTranscript(testId, testContent);

    // Should exist now
    expect(await hasTranscript(testId)).toBe(true);
    expect(await getTranscript(testId)).toBe(testContent);

    // Delete transcript
    await deleteTranscript(testId);

    // Should not exist anymore
    expect(await hasTranscript(testId)).toBe(false);
    expect(await getTranscript(testId)).toBe(null);
  } finally {
    // Always clean up, even when an assertion above throws.
    await deleteTranscript(testId);
  }
});
+
+
// Verifies that a transcript of 1024 * 1024 characters survives a
// save/read round trip unchanged.
test("transcript storage handles large content", async () => {
  const testId = "test-large-transcript";
  // Create a 1MB transcript
  const largeContent = "A".repeat(1024 * 1024);

  try {
    await saveTranscript(testId, largeContent);
    const retrieved = await getTranscript(testId);

    expect(retrieved).toBe(largeContent);
    expect(retrieved?.length).toBe(1024 * 1024);
  } finally {
    // Remove the 1MB file even if an assertion fails, so failed runs do not
    // accumulate large files in the transcripts directory.
    await deleteTranscript(testId);
  }
});
+
+
// Every storage operation must reject IDs that contain path-traversal
// sequences.
test("transcript storage prevents directory traversal", async () => {
  const traversalAttempts = [
    "../../../etc/passwd",
    "../../secret.txt",
    "../config",
    "test/../../../etc/passwd",
    "test/../../passwd",
  ];

  for (const attempt of traversalAttempts) {
    // Same four operations, in the same order, for each malicious ID.
    const operations: Array<() => Promise<unknown>> = [
      () => saveTranscript(attempt, "malicious"),
      () => getTranscript(attempt),
      () => deleteTranscript(attempt),
      () => hasTranscript(attempt),
    ];

    for (const runOperation of operations) {
      await expect(runOperation()).rejects.toThrow();
    }
  }
});
+
+
// IDs containing shell metacharacters, NUL bytes or newlines must be rejected
// with an "Invalid transcription ID" error when saving.
test("transcript storage validates ID format", async () => {
  const expectedMessage = "Invalid transcription ID";
  const rejectedIds = [
    "test; rm -rf /",
    "test`whoami`",
    "test\x00null",
    "test\nls",
    "test&&ls",
  ];

  for (const rejectedId of rejectedIds) {
    const pendingSave = saveTranscript(rejectedId, "test");
    await expect(pendingSave).rejects.toThrow(expectedMessage);
  }
});
+
+
// IDs made only of letters, digits, hyphens and underscores must be accepted.
// The list mixes real UUIDs with shorter non-UUID identifiers, so the title
// says "IDs" rather than "UUIDs".
test("transcript storage accepts valid IDs", async () => {
  const validIds = [
    "550e8400-e29b-41d4-a716-446655440000",
    "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
    "test-abc-123",
    "abc123",
  ];

  for (const id of validIds) {
    try {
      await saveTranscript(id, "test");
      expect(await getTranscript(id)).toBe("test");
    } finally {
      // Clean up even when the assertion fails so no files are left behind.
      await deleteTranscript(id);
    }
  }
});
+
+75
src/lib/transcript-storage.ts
···
+
// File-based transcript storage to avoid SQLite size limits
+
+
import { unlinkSync } from "node:fs";
+
import { basename } from "node:path";
+
+
// Directory prefix under which each transcript is stored as `<id>.txt`.
// The path is relative, so it resolves against the process working directory.
const TRANSCRIPTS_DIR = "./transcripts";
+
+
/**
 * Check that a transcription ID is safe to embed in a file name.
 *
 * @param id - Candidate transcription ID.
 * @returns The ID unchanged, once it has passed every check.
 * @throws Error if the ID is empty, contains any character other than
 *   letters, digits, `_` or `-`, or is not equal to its own basename.
 */
function validateTranscriptionId(id: string): string {
  const isEmpty = !id || id.length === 0;
  if (isEmpty) {
    throw new Error("Invalid transcription ID: empty");
  }

  // Whitelist: alphanumerics, hyphens and underscores only.
  const allowedCharacters = /^[a-zA-Z0-9_-]+$/;
  if (allowedCharacters.test(id) === false) {
    throw new Error("Invalid transcription ID format");
  }

  // Second line of defence: the ID must already be a bare file name.
  const leafName = basename(id);
  if (leafName === id) {
    return leafName;
  }
  throw new Error("Invalid transcription ID: path traversal detected");
}
+
+
/**
 * Persist a transcript to `<TRANSCRIPTS_DIR>/<id>.txt`.
 *
 * @param transcriptionId - ID of the transcription; validated before use.
 * @param transcript - Text to write to the file.
 * @throws Error if the ID fails validation.
 */
export async function saveTranscript(
  transcriptionId: string,
  transcript: string,
): Promise<void> {
  const targetPath = `${TRANSCRIPTS_DIR}/${validateTranscriptionId(transcriptionId)}.txt`;
  await Bun.write(targetPath, transcript);
}
+
+
/**
 * Load the transcript stored at `<TRANSCRIPTS_DIR>/<id>.txt`.
 *
 * @param transcriptionId - ID of the transcription; validated before use.
 * @returns The file's text, or `null` if reading it fails for any reason.
 * @throws Error if the ID fails validation (validation runs outside the
 *   read guard, so it is not converted to `null`).
 */
export async function getTranscript(
  transcriptionId: string,
): Promise<string | null> {
  const safeId = validateTranscriptionId(transcriptionId);

  let contents: string | null = null;
  try {
    contents = await Bun.file(`${TRANSCRIPTS_DIR}/${safeId}.txt`).text();
  } catch {
    // Any read failure is reported to the caller as "no transcript".
  }
  return contents;
}
+
+
/**
 * Delete the transcript file at `<TRANSCRIPTS_DIR>/<id>.txt`, if present.
 *
 * A missing file counts as success, so calling this repeatedly is safe.
 * Any other file-system failure is rethrown rather than silently discarded,
 * so callers never believe a transcript was removed when it is still on disk.
 *
 * @param transcriptionId - ID of the transcription; validated before use.
 * @throws Error if the ID fails validation, or if removal fails for a reason
 *   other than the file not existing.
 */
export async function deleteTranscript(transcriptionId: string): Promise<void> {
  const safeId = validateTranscriptionId(transcriptionId);
  const filePath = `${TRANSCRIPTS_DIR}/${safeId}.txt`;
  try {
    unlinkSync(filePath);
  } catch (error: unknown) {
    // File-system errors carry a string `code`; narrow without assuming a shape.
    const code =
      typeof error === "object" && error !== null && "code" in error
        ? (error as { code?: unknown }).code
        : undefined;
    // ENOENT: file doesn't exist or was already deleted, so nothing to do.
    if (code !== "ENOENT") {
      throw error;
    }
  }
}
+
+
/**
 * Report whether a transcript file exists at `<TRANSCRIPTS_DIR>/<id>.txt`.
 *
 * @param transcriptionId - ID of the transcription; validated before use.
 * @returns `true` when the file exists, `false` otherwise.
 * @throws Error if the ID fails validation.
 */
export async function hasTranscript(transcriptionId: string): Promise<boolean> {
  const transcriptFile = Bun.file(
    `${TRANSCRIPTS_DIR}/${validateTranscriptionId(transcriptionId)}.txt`,
  );
  return transcriptFile.exists();
}
+19 -16
src/lib/transcription.ts
···
import type { Database } from "bun:sqlite";
import { createEventSource } from "eventsource-client";
import { ErrorCode } from "./errors";
+
import { saveTranscript } from "./transcript-storage";
// Constants
export const MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
-
export const MAX_TRANSCRIPT_LENGTH = 50000;
export const MAX_ERROR_LENGTH = 255;
// Types
···
this.activeStreams.set(transcriptionId, es);
}
-
private handleWhisperUpdate(
+
private async handleWhisperUpdate(
transcriptionId: string,
filePath: string,
update: WhisperJob,
···
let transcript = update.transcript ?? "";
transcript = transcript.replace(/<\|[^|]+\|>/g, "").trim();
+
// Save transcript to file (overwrites on each update)
+
if (transcript) {
+
await saveTranscript(transcriptionId, transcript);
+
}
+
this.updateTranscription(transcriptionId, {
status,
progress,
-
transcript,
});
this.events.emit(transcriptionId, {
···
});
} else if (update.status === "completed") {
// Final transcript should already have tokens stripped by Murmur
-
const transcript = (update.transcript ?? "").substring(
-
0,
-
MAX_TRANSCRIPT_LENGTH,
-
);
+
const transcript = update.transcript ?? "";
+
+
// Save final transcript to file
+
if (transcript) {
+
await saveTranscript(transcriptionId, transcript);
+
}
this.updateTranscription(transcriptionId, {
status: "completed",
progress: 100,
-
transcript,
});
this.events.emit(transcriptionId, {
···
data: {
status?: TranscriptionStatus;
progress?: number;
-
transcript?: string;
error_message?: string;
},
) {
···
if (data.progress !== undefined) {
updates.push("progress = ?");
values.push(data.progress);
-
}
-
if (data.transcript !== undefined) {
-
updates.push("transcript = ?");
-
values.push(data.transcript);
}
if (data.error_message !== undefined) {
updates.push("error_message = ?");
···
if (!details) return;
if (details.status === "completed") {
-
const transcript =
-
details.transcript?.substring(0, MAX_TRANSCRIPT_LENGTH) ?? "";
+
const transcript = details.transcript ?? "";
+
+
// Save transcript to file
+
if (transcript) {
+
await saveTranscript(transcriptionId, transcript);
+
}
this.updateTranscription(transcriptionId, {
status: "completed",
progress: 100,
-
transcript,
});
this.events.emit(transcriptionId, {