this repo has no description
1#!/usr/bin/env bun
2// Evaluate classifier performance against labeled test data
3
4import { readFile } from "fs/promises";
5import { EmailClassifier } from "./classifier.ts";
6import type { LabeledEmail, TestCase, TestResult, ClassificationResult } from "./types.ts";
7
/**
 * Shape of the labeled-emails JSON file that this script parses.
 *
 * Only `emails` is read by the evaluation code in this file; the remaining
 * fields are bookkeeping carried by the file itself.
 */
interface LabeledData {
  /** Email records. Entries with no `pertains` value are treated as unlabeled. */
  emails: LabeledEmail[];

  // --- bookkeeping fields (not read by this script) ---

  /** NOTE(review): presumably the file the emails were exported from — confirm with the labeling tool. */
  source_file: string;
  /** NOTE(review): presumably a timestamp string for when labeling happened — format not visible here. */
  labeled_at: string;
  /** NOTE(review): presumably the number of entries in `emails` — confirm. */
  total_count: number;
  /** NOTE(review): presumably how many entries carry a label — confirm. */
  labeled_count: number;
}
15
/** Labeled-data file used when no path is given on the command line. */
const DEFAULT_LABELED_FILE = "data/labeled-emails.json";
/** Width, in characters, of the horizontal rules framing each report section. */
const REPORT_RULE_WIDTH = 80;
/** Accuracy at or above which the run exits with status 0. */
const PASSING_ACCURACY = 0.9;
/** Accuracy at or above which the report says "Excellent". */
const EXCELLENT_ACCURACY = 0.95;
/** Accuracy at or above which the report says "Good" (below this it says "Poor"). */
const GOOD_ACCURACY = 0.85;
/** Recall below this value triggers a warning in the recommendations. */
const MIN_RECALL = 0.9;
/** Precision below this value triggers a warning in the recommendations. */
const MIN_PRECISION = 0.9;

/** Tallies of the four possible outcomes of a binary classification. */
interface ConfusionCounts {
  truePositives: number;
  trueNegatives: number;
  falsePositives: number;
  falseNegatives: number;
}

/** Derived quality metrics, each in the range 0..1. */
interface ClassifierMetrics {
  accuracy: number;
  precision: number;
  recall: number;
  f1Score: number;
}

/** Divides, but yields 0 instead of NaN/Infinity when the denominator is not positive. */
function safeRatio(numerator: number, denominator: number): number {
  return denominator > 0 ? numerator / denominator : 0;
}

/**
 * Computes accuracy, precision, recall and F1 from outcome tallies.
 * Every ratio falls back to 0 when its denominator is 0, so an empty
 * tally produces all-zero metrics rather than NaN.
 */
function computeMetrics(counts: ConfusionCounts): ClassifierMetrics {
  const { truePositives, trueNegatives, falsePositives, falseNegatives } = counts;
  const total = truePositives + trueNegatives + falsePositives + falseNegatives;

  const accuracy = safeRatio(truePositives + trueNegatives, total);
  const precision = safeRatio(truePositives, truePositives + falsePositives);
  const recall = safeRatio(truePositives, truePositives + falseNegatives);
  const f1Score = safeRatio(2 * (precision * recall), precision + recall);

  return { accuracy, precision, recall, f1Score };
}

/**
 * Tells whether an email record carries a label.
 * JSON cannot encode `undefined`, so an unlabeled record arrives either with
 * the key missing or with an explicit `null`; both count as unlabeled.
 */
function isLabeled(email: { pertains?: boolean | null }): boolean {
  return email.pertains != null;
}

/** Formats a 0..1 fraction as a percentage number string (without the `%` sign). */
function formatPercent(fraction: number, digits = 1): string {
  return (fraction * 100).toFixed(digits);
}

/** Prints one horizontal rule of the report. */
function printRule(): void {
  console.log("═".repeat(REPORT_RULE_WIDTH));
}

/** Prints the headline counts and metrics. */
function printSummary(results: TestResult): void {
  printRule();
  console.log("EVALUATION RESULTS");
  printRule();
  console.log(`Total test cases: ${results.total}`);
  console.log(`Correct: ${results.correct} (${formatPercent(results.accuracy)}%)`);
  console.log(`Incorrect: ${results.incorrect}`);
  console.log(` False positives: ${results.false_positives} (said relevant when not)`);
  console.log(` False negatives: ${results.false_negatives} (said not relevant when is)`);
  console.log();
  console.log(`Accuracy: ${formatPercent(results.accuracy)}%`);
  console.log(`Precision: ${formatPercent(results.precision)}% (of predicted relevant, % correct)`);
  console.log(`Recall: ${formatPercent(results.recall)}% (of actual relevant, % found)`);
  console.log(`F1 Score: ${formatPercent(results.f1_score)}%`);
  printRule();
}

/** Prints every misclassified email, or a success banner when there are none. */
function printFailures(results: TestResult): void {
  if (results.failures.length === 0) {
    console.log("\n✅ ALL TESTS PASSED!\n");
    return;
  }

  console.log("\n❌ FAILURES:\n");
  for (const [index, failure] of results.failures.entries()) {
    const expected = failure.test_case.expected.pertains;
    const actual = failure.actual.pertains;

    console.log(`${index + 1}. ${actual ? "FALSE POSITIVE" : "FALSE NEGATIVE"}`);
    console.log(` Subject: ${failure.test_case.input.subject}`);
    console.log(` From: ${failure.test_case.input.from}`);
    console.log(` Expected: ${expected ? "RELEVANT" : "NOT RELEVANT"} (${failure.test_case.expected.reason})`);
    console.log(` Got: ${actual ? "RELEVANT" : "NOT RELEVANT"} (${failure.actual.reason})`);
    console.log(` Confidence: ${formatPercent(failure.actual.confidence, 0)}%`);
    // `||` (not `??`) on purpose: an empty rule list joins to "" and should also print "none".
    console.log(` Rules: ${failure.actual.matched_rules?.join(", ") || "none"}`);
    console.log();
  }
}

/** Prints the qualitative verdict and tuning hints derived from the metrics. */
function printRecommendations(results: TestResult): void {
  printRule();
  console.log("RECOMMENDATIONS");
  printRule();

  if (results.accuracy >= EXCELLENT_ACCURACY) {
    console.log("✅ Excellent! Classifier is performing very well.");
  } else if (results.accuracy >= GOOD_ACCURACY) {
    console.log("⚠️ Good performance, but room for improvement.");
  } else {
    console.log("❌ Poor performance. Significant improvements needed.");
  }

  if (results.false_negatives > results.false_positives) {
    console.log("⚠️ More false negatives than false positives.");
    console.log(" Risk: Missing important emails (they'll be filtered).");
    console.log(" Recommendation: Add more rules to catch relevant emails.");
  } else if (results.false_positives > results.false_negatives) {
    console.log("⚠️ More false positives than false negatives.");
    console.log(" Risk: Spam getting through to inbox.");
    console.log(" Recommendation: Tighten rules to reduce false relevance.");
  }

  if (results.recall < MIN_RECALL) {
    console.log(`⚠️ Low recall (${formatPercent(results.recall)}%). Missing too many relevant emails.`);
  }

  if (results.precision < MIN_PRECISION) {
    console.log(`⚠️ Low precision (${formatPercent(results.precision)}%). Too many false alarms.`);
  }

  printRule();
}

/**
 * Runs the classifier over every labeled email in a JSON file, prints a
 * report (summary, individual failures, recommendations) and terminates the
 * process: exit status 0 when accuracy is at least `PASSING_ACCURACY`,
 * otherwise 1.
 *
 * The file path is taken from the first command-line argument and falls back
 * to `DEFAULT_LABELED_FILE`.
 *
 * @returns A promise that never resolves normally, because the function ends
 *   by calling `process.exit`.
 * @throws If the file cannot be read or parsed, if it has no `emails` array,
 *   or if it contains no labeled emails.
 */
async function evaluate(): Promise<void> {
  console.log("📊 Evaluating Email Classifier\n");

  // Load labeled data. `||` (not `??`) so an empty-string argument also falls back.
  const args = process.argv.slice(2);
  const labeledFile = args[0] || DEFAULT_LABELED_FILE;
  const parsed: unknown = JSON.parse(await readFile(labeledFile, "utf-8"));
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { emails?: unknown }).emails)
  ) {
    throw new Error(`Invalid labeled data in ${labeledFile}: expected an object with an "emails" array`);
  }
  const data = parsed as LabeledData;

  // Keep only labeled emails (a missing or null `pertains` means unlabeled).
  const labeled = data.emails.filter((email) => isLabeled(email));
  if (labeled.length === 0) {
    // Without this guard every metric would be 0/0 and the report meaningless.
    throw new Error(`No labeled emails found in ${labeledFile}; nothing to evaluate.`);
  }

  console.log(`Loaded ${labeled.length} labeled emails`);
  console.log(` Relevant: ${labeled.filter((e) => e.pertains).length}`);
  console.log(` Not relevant: ${labeled.filter((e) => !e.pertains).length}\n`);

  // Create test cases
  const testCases: TestCase[] = labeled.map((e) => ({
    input: {
      subject: e.subject,
      from: e.from,
      to: e.to,
      cc: e.cc,
      body: e.body,
      date: e.date,
    },
    expected: {
      // Safe: the filter above removed every record without a label.
      pertains: e.pertains!,
      reason: e.reason || "",
    },
    metadata: {
      thread_id: e.thread_id,
      date: e.date,
      confidence: e.confidence || "unknown",
      notes: e.notes,
      labels: e.labels,
    },
  }));

  // Run classifier on all test cases
  const classifier = new EmailClassifier();
  const results: TestResult = {
    total: testCases.length,
    correct: 0,
    incorrect: 0,
    accuracy: 0,
    false_positives: 0,
    false_negatives: 0,
    precision: 0,
    recall: 0,
    f1_score: 0,
    failures: [],
  };

  let truePositives = 0;
  let trueNegatives = 0;

  for (const testCase of testCases) {
    const actual = classifier.classify(testCase.input);
    const expected = testCase.expected.pertains;
    const predicted = actual.pertains;

    if (predicted === expected) {
      results.correct++;
      if (predicted) truePositives++;
      else trueNegatives++;
      continue;
    }

    results.incorrect++;
    results.failures.push({ test_case: testCase, actual });
    if (predicted && !expected) {
      results.false_positives++; // Said relevant when not
    } else {
      results.false_negatives++; // Said not relevant when is
    }
  }

  // Calculate metrics
  const metrics = computeMetrics({
    truePositives,
    trueNegatives,
    falsePositives: results.false_positives,
    falseNegatives: results.false_negatives,
  });
  results.accuracy = metrics.accuracy;
  results.precision = metrics.precision;
  results.recall = metrics.recall;
  results.f1_score = metrics.f1Score;

  printSummary(results);
  printFailures(results);
  printRecommendations(results);

  // Return exit code based on performance
  process.exit(results.accuracy >= PASSING_ACCURACY ? 0 : 1);
}
178
/** Logs an error that escaped `evaluate` and terminates the process with status 1. */
function reportFatalError(error: unknown): never {
  console.error("Error:", error);
  process.exit(1);
}

// Script entry point: run the evaluation and route any rejection to the fatal handler.
evaluate().catch(reportFatalError);