this repo has no description
at main 6.5 kB view raw
#!/usr/bin/env bun
// Evaluate classifier performance against labeled test data

import { readFile } from "fs/promises";
import { EmailClassifier } from "./classifier.ts";
import type { LabeledEmail, TestCase, TestResult, ClassificationResult } from "./types.ts";

/**
 * Top-level shape of the labeled-data JSON file.
 * Only `emails` is read by this script; the other fields are carried for
 * completeness of the file format.
 */
interface LabeledData {
  source_file: string;
  labeled_at: string;
  total_count: number;
  labeled_count: number;
  emails: LabeledEmail[];
}

/** An email whose `pertains` label is a real boolean (i.e. it has been labeled). */
type JudgedEmail = LabeledEmail & { pertains: boolean };

/** File read when no path is given on the command line. */
const DEFAULT_LABELED_FILE = "data/labeled-emails.json";

/** Minimum accuracy for the script to exit with status 0. */
const PASSING_ACCURACY = 0.90;

/** Horizontal rule used to frame the report sections. */
const RULE = "═".repeat(80);

/** Formats a 0..1 ratio as a percentage string with `digits` decimals (no `%` sign). */
function percent(ratio: number, digits = 1): string {
  return (ratio * 100).toFixed(digits);
}

/** Divides `numerator` by `denominator`, yielding 0 instead of NaN/Infinity when the denominator is 0. */
function safeRatio(numerator: number, denominator: number): number {
  return denominator > 0 ? numerator / denominator : 0;
}

/**
 * Type guard: true only when the email carries a boolean `pertains` label.
 *
 * JSON cannot represent `undefined`, so an unlabeled entry may arrive either
 * with the key missing or with `pertains: null`. Checking for a boolean keeps
 * both out of the test set instead of scoring `null` as "not relevant".
 */
function isJudged(email: LabeledEmail): email is JudgedEmail {
  return email != null && typeof email.pertains === "boolean";
}

/**
 * Reads the labeled-data file and returns only the emails that have a label.
 *
 * @param path - Path to the labeled JSON file.
 * @returns The labeled emails, in file order.
 * @throws Error when the file content is not an object with an `emails` array
 *         (read and JSON syntax errors propagate unchanged).
 */
async function loadLabeledEmails(path: string): Promise<JudgedEmail[]> {
  const parsed: unknown = JSON.parse(await readFile(path, "utf-8"));

  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as Partial<LabeledData>).emails)
  ) {
    throw new Error(`${path} does not contain an "emails" array`);
  }

  return (parsed as LabeledData).emails.filter(isJudged);
}

/** Builds the classifier test case (input, expected verdict, metadata) for one labeled email. */
function toTestCase(email: JudgedEmail): TestCase {
  return {
    input: {
      subject: email.subject,
      from: email.from,
      to: email.to,
      cc: email.cc,
      body: email.body,
      date: email.date
    },
    expected: {
      pertains: email.pertains,
      reason: email.reason || ""
    },
    metadata: {
      thread_id: email.thread_id,
      date: email.date,
      confidence: email.confidence || "unknown",
      notes: email.notes,
      labels: email.labels
    }
  };
}

/**
 * Runs a fresh `EmailClassifier` over every test case and tallies the outcome.
 *
 * @param testCases - Cases to classify.
 * @returns Counts, accuracy/precision/recall/F1 (each 0 when its denominator
 *          is 0), and the list of misclassified cases with the actual result.
 */
function scoreClassifier(testCases: TestCase[]): TestResult {
  const classifier = new EmailClassifier();
  const results: TestResult = {
    total: testCases.length,
    correct: 0,
    incorrect: 0,
    accuracy: 0,
    false_positives: 0,
    false_negatives: 0,
    precision: 0,
    recall: 0,
    f1_score: 0,
    failures: []
  };

  let truePositives = 0;

  for (const testCase of testCases) {
    const actual = classifier.classify(testCase.input);
    const expected = testCase.expected.pertains;
    const predicted = actual.pertains;

    if (predicted === expected) {
      results.correct++;
      if (predicted) truePositives++;
      continue;
    }

    results.incorrect++;
    results.failures.push({ test_case: testCase, actual });

    if (predicted && !expected) {
      results.false_positives++; // Said relevant when not
    } else {
      results.false_negatives++; // Said not relevant when is
    }
  }

  results.accuracy = safeRatio(results.correct, results.total);
  results.precision = safeRatio(truePositives, truePositives + results.false_positives);
  results.recall = safeRatio(truePositives, truePositives + results.false_negatives);

  // Harmonic mean of precision and recall; 0 when both are 0.
  results.f1_score = safeRatio(
    2 * (results.precision * results.recall),
    results.precision + results.recall
  );

  return results;
}

/** Prints the headline counts and metrics. */
function printSummary(results: TestResult): void {
  console.log(RULE);
  console.log("EVALUATION RESULTS");
  console.log(RULE);
  console.log(`Total test cases: ${results.total}`);
  console.log(`Correct: ${results.correct} (${percent(results.accuracy)}%)`);
  console.log(`Incorrect: ${results.incorrect}`);
  console.log(` False positives: ${results.false_positives} (said relevant when not)`);
  console.log(` False negatives: ${results.false_negatives} (said not relevant when is)`);
  console.log();
  console.log(`Accuracy: ${percent(results.accuracy)}%`);
  console.log(`Precision: ${percent(results.precision)}% (of predicted relevant, % correct)`);
  console.log(`Recall: ${percent(results.recall)}% (of actual relevant, % found)`);
  console.log(`F1 Score: ${percent(results.f1_score)}%`);
  console.log(RULE);
}

/** Prints one numbered entry per misclassified email, or a success banner when there are none. */
function printFailures(results: TestResult): void {
  if (results.failures.length === 0) {
    console.log("\n✅ ALL TESTS PASSED!\n");
    return;
  }

  console.log("\n❌ FAILURES:\n");
  for (const [index, failure] of results.failures.entries()) {
    const expected = failure.test_case.expected.pertains;
    const actual = failure.actual.pertains;

    console.log(`${index + 1}. ${actual ? "FALSE POSITIVE" : "FALSE NEGATIVE"}`);
    console.log(` Subject: ${failure.test_case.input.subject}`);
    console.log(` From: ${failure.test_case.input.from}`);
    console.log(` Expected: ${expected ? "RELEVANT" : "NOT RELEVANT"} (${failure.test_case.expected.reason})`);
    console.log(` Got: ${actual ? "RELEVANT" : "NOT RELEVANT"} (${failure.actual.reason})`);
    console.log(` Confidence: ${percent(failure.actual.confidence, 0)}%`);
    console.log(` Rules: ${failure.actual.matched_rules?.join(", ") || "none"}`);
    console.log();
  }
}

/** Prints an overall verdict plus warnings about error skew, low recall and low precision. */
function printRecommendations(results: TestResult): void {
  console.log(RULE);
  console.log("RECOMMENDATIONS");
  console.log(RULE);

  if (results.accuracy >= 0.95) {
    console.log("✅ Excellent! Classifier is performing very well.");
  } else if (results.accuracy >= 0.85) {
    console.log("⚠️ Good performance, but room for improvement.");
  } else {
    console.log("❌ Poor performance. Significant improvements needed.");
  }

  if (results.false_negatives > results.false_positives) {
    console.log("⚠️ More false negatives than false positives.");
    console.log(" Risk: Missing important emails (they'll be filtered).");
    console.log(" Recommendation: Add more rules to catch relevant emails.");
  } else if (results.false_positives > results.false_negatives) {
    console.log("⚠️ More false positives than false negatives.");
    console.log(" Risk: Spam getting through to inbox.");
    console.log(" Recommendation: Tighten rules to reduce false relevance.");
  }

  if (results.recall < 0.9) {
    console.log(`⚠️ Low recall (${percent(results.recall)}%). Missing too many relevant emails.`);
  }

  if (results.precision < 0.9) {
    console.log(`⚠️ Low precision (${percent(results.precision)}%). Too many false alarms.`);
  }

  console.log(RULE);
}

/**
 * Script entry point: loads labeled emails, classifies them, prints a report,
 * and terminates the process.
 *
 * The first command-line argument is the labeled-data path; it defaults to
 * `data/labeled-emails.json`. The process exits with 0 when accuracy is at
 * least 0.90 and with 1 otherwise, including when there are no labeled
 * emails to evaluate.
 */
async function evaluate(): Promise<void> {
  console.log("📊 Evaluating Email Classifier\n");

  // Load labeled data
  const args = process.argv.slice(2);
  const labeledFile = args[0] || DEFAULT_LABELED_FILE;
  const labeled = await loadLabeledEmails(labeledFile);

  console.log(`Loaded ${labeled.length} labeled emails`);
  console.log(` Relevant: ${labeled.filter(e => e.pertains).length}`);
  console.log(` Not relevant: ${labeled.filter(e => !e.pertains).length}\n`);

  // With nothing to score, every metric would be 0/0; fail with a clear
  // message rather than printing a misleading report.
  if (labeled.length === 0) {
    console.error(`No labeled emails found in ${labeledFile}; nothing to evaluate.`);
    process.exit(1);
  }

  const results = scoreClassifier(labeled.map(toTestCase));

  printSummary(results);
  printFailures(results);
  printRecommendations(results);

  // Return exit code based on performance
  process.exit(results.accuracy >= PASSING_ACCURACY ? 0 : 1);
}

evaluate().catch((error: unknown) => {
  console.error("Error:", error);
  process.exit(1);
});