My agentic slop goes here. Not intended for anyone else!
1(** Thread Reconstruction Algorithms Implementation.
2
3 Implements RFC 5256 threading algorithms and custom conversation grouping
4 for organizing emails into discussion threads.
5*)
6
7(* Remove open statement to avoid circular dependency *)
8
9type thread_group = {
10 thread_id : Jmap.Id.t;
11 email_ids : Jmap.Id.t list;
12 root_email_id : Jmap.Id.t option;
13 last_updated : Jmap.Date.t;
14}
15
16type email_relationship = {
17 email_id : Jmap.Id.t;
18 message_id : string option;
19 in_reply_to : string option;
20 references : string list;
21 subject : string;
22 date : Jmap.Date.t;
23}
24
25type algorithm = [
26 | `RFC5256_REFERENCES
27 | `RFC5256_ORDEREDSUBJECT
28 | `HYBRID
29 | `CONVERSATION
30]
31
32(** Extract email relationship information *)
33let extract_relationships (email : Jmap_email.Email.Email.t) : email_relationship =
34 let email_id = match Jmap_email.Email.Email.id email with
35 | Some id -> id
36 | None -> failwith "Email must have an ID for threading"
37 in
38
39 (* Extract Message-ID header *)
40 let message_id =
41 match Jmap_email.Email.Email.headers email with
42 | Some headers ->
43 (try
44 let msg_id_header = List.find (fun h ->
45 String.lowercase_ascii (Jmap_email.Header.name h) = "message-id"
46 ) headers in
47 Some (Jmap_email.Header.value msg_id_header)
48 with Not_found -> None)
49 | None -> None
50 in
51
52 (* Extract In-Reply-To header *)
53 let in_reply_to =
54 match Jmap_email.Email.Email.headers email with
55 | Some headers ->
56 (try
57 let reply_header = List.find (fun h ->
58 String.lowercase_ascii (Jmap_email.Header.name h) = "in-reply-to"
59 ) headers in
60 Some (Jmap_email.Header.value reply_header)
61 with Not_found -> None)
62 | None -> None
63 in
64
65 (* Extract References header *)
66 let references =
67 match Jmap_email.Email.Email.headers email with
68 | Some headers ->
69 (try
70 let refs_header = List.find (fun h ->
71 String.lowercase_ascii (Jmap_email.Header.name h) = "references"
72 ) headers in
73 (* Split references by whitespace *)
74 String.split_on_char ' ' (Jmap_email.Header.value refs_header)
75 |> List.filter (fun s -> String.length s > 0)
76 with Not_found -> [])
77 | None -> []
78 in
79
80 (* Get normalized subject *)
81 let subject = match Jmap_email.Email.Email.subject email with
82 | Some s -> s
83 | None -> ""
84 in
85
86 (* Get email date *)
87 let date = match Jmap_email.Email.Email.received_at email with
88 | Some d -> d
89 | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok
90 in
91
92 {
93 email_id;
94 message_id;
95 in_reply_to;
96 references;
97 subject;
98 date;
99 }
100
101(** Build a thread group from related emails *)
102let build_thread_group (emails : Email.Email.t list) : thread_group =
103 match emails with
104 | [] -> failwith "Cannot build thread group from empty email list"
105 | _ ->
106 (* Generate thread ID from first email or use hash of message IDs *)
107 let thread_id =
108 let first_email = List.hd emails in
109 match Email.Email.id first_email with
110 | Some id -> id (* Use first email's ID as thread ID *)
111 | None -> Jmap.Id.of_string "thread-generated" |> Result.get_ok
112 in
113
114 (* Extract all email IDs *)
115 let email_ids = List.filter_map Email.Email.id emails in
116
117 (* Find root email (earliest without In-Reply-To) *)
118 let root_email_id =
119 let sorted = List.sort (fun e1 e2 ->
120 let d1 = match Email.Email.received_at e1 with
121 | Some d -> d
122 | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok
123 in
124 let d2 = match Email.Email.received_at e2 with
125 | Some d -> d
126 | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok
127 in
128 compare (Jmap.Date.to_timestamp d1) (Jmap.Date.to_timestamp d2)
129 ) emails in
130 Email.Email.id (List.hd sorted)
131 in
132
133 (* Find most recent email date *)
134 let last_updated =
135 let dates = List.filter_map Email.Email.received_at emails in
136 let sorted_dates = List.sort (fun d1 d2 ->
137 compare (Jmap.Date.to_timestamp d2) (Jmap.Date.to_timestamp d1)
138 ) dates in
139 match sorted_dates with
140 | d :: _ -> d
141 | [] -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok
142 in
143
144 {
145 thread_id;
146 email_ids;
147 root_email_id;
148 last_updated;
149 }
150
151(** Normalize subject for comparison *)
152let normalize_subject subject =
153 let s = String.lowercase_ascii subject in
154 (* Remove common prefixes *)
155 let prefixes = ["re:"; "re :"; "fwd:"; "fwd :"; "fw:"; "fw :"] in
156 let rec remove_prefixes s = function
157 | [] -> s
158 | prefix :: rest ->
159 if String.starts_with ~prefix s then
160 let s' = String.sub s (String.length prefix) (String.length s - String.length prefix) in
161 remove_prefixes (String.trim s') prefixes (* Restart with all prefixes *)
162 else
163 remove_prefixes s rest
164 in
165 let normalized = remove_prefixes (String.trim s) prefixes in
166 (* Collapse whitespace *)
167 String.split_on_char ' ' normalized
168 |> List.filter (fun s -> String.length s > 0)
169 |> String.concat " "
170
171(** Thread by REFERENCES algorithm (RFC 5256) *)
172let thread_by_references emails =
173 (* Build a map of Message-ID to emails *)
174 let message_id_map = Hashtbl.create 100 in
175 let relationships = List.map extract_relationships emails in
176
177 (* Index emails by Message-ID *)
178 List.iter2 (fun email rel ->
179 match rel.message_id with
180 | Some msg_id -> Hashtbl.add message_id_map msg_id email
181 | None -> ()
182 ) emails relationships;
183
184 (* Build parent-child relationships *)
185 let thread_groups = Hashtbl.create 50 in
186 let processed = Hashtbl.create 100 in
187
188 List.iter2 (fun email rel ->
189 if not (Hashtbl.mem processed rel.email_id) then begin
190 (* Find thread root by following references *)
191 let thread_emails = ref [email] in
192
193 (* Add emails referenced in References header *)
194 List.iter (fun ref_id ->
195 try
196 let ref_email = Hashtbl.find message_id_map ref_id in
197 if not (List.memq ref_email !thread_emails) then
198 thread_emails := ref_email :: !thread_emails
199 with Not_found -> ()
200 ) rel.references;
201
202 (* Add email referenced in In-Reply-To *)
203 (match rel.in_reply_to with
204 | Some reply_id ->
205 (try
206 let parent_email = Hashtbl.find message_id_map reply_id in
207 if not (List.memq parent_email !thread_emails) then
208 thread_emails := parent_email :: !thread_emails
209 with Not_found -> ())
210 | None -> ());
211
212 (* Mark all emails as processed *)
213 List.iter (fun e ->
214 match Email.Email.id e with
215 | Some id -> Hashtbl.add processed id true
216 | None -> ()
217 ) !thread_emails;
218
219 (* Create thread group *)
220 if List.length !thread_emails > 0 then
221 let group = build_thread_group !thread_emails in
222 Hashtbl.add thread_groups group.thread_id group
223 end
224 ) emails relationships;
225
226 (* Collect all thread groups *)
227 Hashtbl.fold (fun _ group acc -> group :: acc) thread_groups []
228
229(** Thread by ORDEREDSUBJECT algorithm (RFC 5256) *)
230let thread_by_ordered_subject emails =
231 (* Group emails by normalized subject *)
232 let subject_map = Hashtbl.create 50 in
233
234 List.iter (fun email ->
235 let subject = match Email.Email.subject email with
236 | Some s -> normalize_subject s
237 | None -> ""
238 in
239 let emails_with_subject =
240 try Hashtbl.find subject_map subject
241 with Not_found -> []
242 in
243 Hashtbl.replace subject_map subject (email :: emails_with_subject)
244 ) emails;
245
246 (* Create thread groups from subject groups *)
247 Hashtbl.fold (fun _ email_list acc ->
248 if List.length email_list > 0 then
249 let sorted_emails = List.sort (fun e1 e2 ->
250 let d1 = match Email.Email.received_at e1 with
251 | Some d -> Jmap.Date.to_timestamp d
252 | None -> 0.0
253 in
254 let d2 = match Email.Email.received_at e2 with
255 | Some d -> Jmap.Date.to_timestamp d
256 | None -> 0.0
257 in
258 compare d1 d2
259 ) email_list in
260 let group = build_thread_group sorted_emails in
261 group :: acc
262 else
263 acc
264 ) subject_map []
265
266(** Hybrid threading algorithm *)
267let thread_hybrid emails =
268 (* First try REFERENCES algorithm *)
269 let ref_threads = thread_by_references emails in
270
271 (* Collect emails that weren't threaded *)
272 let threaded_ids = Hashtbl.create 100 in
273 List.iter (fun thread ->
274 List.iter (fun id -> Hashtbl.add threaded_ids id true) thread.email_ids
275 ) ref_threads;
276
277 let unthreaded = List.filter (fun email ->
278 match Email.Email.id email with
279 | Some id -> not (Hashtbl.mem threaded_ids id)
280 | None -> false
281 ) emails in
282
283 (* Thread remaining emails by subject *)
284 let subject_threads = thread_by_ordered_subject unthreaded in
285
286 (* Combine results *)
287 ref_threads @ subject_threads
288
289(** Conversation-style threading *)
290let thread_conversations emails =
291 (* Aggressive grouping - combine REFERENCES and subject similarity *)
292 let threads = thread_hybrid emails in
293
294 (* Further merge threads with similar subjects *)
295 let merged = Hashtbl.create 50 in
296
297 List.iter (fun thread ->
298 (* Find if this thread should be merged with an existing one *)
299 let should_merge = ref None in
300
301 Hashtbl.iter (fun tid existing_thread ->
302 (* Check if subjects are similar enough to merge *)
303 if !should_merge = None then begin
304 let thread_emails = List.filter_map (fun id ->
305 List.find_opt (fun e ->
306 match Email.Email.id e with
307 | Some eid -> Jmap.Id.equal eid id
308 | None -> false
309 ) emails
310 ) thread.email_ids in
311
312 let existing_emails = List.filter_map (fun id ->
313 List.find_opt (fun e ->
314 match Email.Email.id e with
315 | Some eid -> Jmap.Id.equal eid id
316 | None -> false
317 ) emails
318 ) existing_thread.email_ids in
319
320 (* Check subject similarity *)
321 let thread_subjects = List.filter_map Email.Email.subject thread_emails
322 |> List.map normalize_subject in
323 let existing_subjects = List.filter_map Email.Email.subject existing_emails
324 |> List.map normalize_subject in
325
326 let common_subjects = List.filter (fun s1 ->
327 List.exists (fun s2 -> s1 = s2) existing_subjects
328 ) thread_subjects in
329
330 if List.length common_subjects > 0 then
331 should_merge := Some tid
332 end
333 ) merged;
334
335 match !should_merge with
336 | Some tid ->
337 (* Merge with existing thread *)
338 let existing = Hashtbl.find merged tid in
339 let merged_thread = {
340 existing with
341 email_ids = existing.email_ids @ thread.email_ids;
342 last_updated =
343 if Jmap.Date.to_timestamp existing.last_updated > Jmap.Date.to_timestamp thread.last_updated
344 then existing.last_updated
345 else thread.last_updated;
346 } in
347 Hashtbl.replace merged tid merged_thread
348 | None ->
349 (* Add as new thread *)
350 Hashtbl.add merged thread.thread_id thread
351 ) threads;
352
353 Hashtbl.fold (fun _ thread acc -> thread :: acc) merged []
354
355(** Apply specified algorithm *)
356let apply_algorithm algorithm emails =
357 match algorithm with
358 | `RFC5256_REFERENCES -> thread_by_references emails
359 | `RFC5256_ORDEREDSUBJECT -> thread_by_ordered_subject emails
360 | `HYBRID -> thread_hybrid emails
361 | `CONVERSATION -> thread_conversations emails
362
363(** Thread relationship graph *)
364module ThreadGraph = struct
365 type t = {
366 mutable threads : (Jmap.Id.t, thread_group) Hashtbl.t;
367 mutable email_to_thread : (Jmap.Id.t, Jmap.Id.t) Hashtbl.t;
368 mutable next_thread_id : int;
369 }
370
371 let create () = {
372 threads = Hashtbl.create 100;
373 email_to_thread = Hashtbl.create 1000;
374 next_thread_id = 1;
375 }
376
377 let add_email t email =
378 let rel = extract_relationships email in
379
380 (* Check if email belongs to existing thread *)
381 let existing_thread =
382 (* Check by In-Reply-To *)
383 match rel.in_reply_to with
384 | Some reply_id ->
385 (* Find email with this Message-ID *)
386 let parent_thread = ref None in
387 Hashtbl.iter (fun email_id thread_id ->
388 if !parent_thread = None then
389 (* Check if any email in this thread has the Message-ID *)
390 try
391 let thread = Hashtbl.find t.threads thread_id in
392 if List.mem email_id thread.email_ids then
393 parent_thread := Some thread_id
394 with Not_found -> ()
395 ) t.email_to_thread;
396 !parent_thread
397 | None -> None
398 in
399
400 match existing_thread with
401 | Some thread_id ->
402 (* Add to existing thread *)
403 let thread = Hashtbl.find t.threads thread_id in
404 let updated_thread = {
405 thread with
406 email_ids = thread.email_ids @ [rel.email_id];
407 last_updated =
408 if Jmap.Date.to_timestamp thread.last_updated > Jmap.Date.to_timestamp rel.date
409 then thread.last_updated
410 else rel.date;
411 } in
412 Hashtbl.replace t.threads thread_id updated_thread;
413 Hashtbl.add t.email_to_thread rel.email_id thread_id
414 | None ->
415 (* Create new thread *)
416 let thread_id =
417 let id_str = Printf.sprintf "thread-%d" t.next_thread_id in
418 t.next_thread_id <- t.next_thread_id + 1;
419 Jmap.Id.of_string id_str |> Result.get_ok
420 in
421 let new_thread = {
422 thread_id;
423 email_ids = [rel.email_id];
424 root_email_id = Some rel.email_id;
425 last_updated = rel.date;
426 } in
427 Hashtbl.add t.threads thread_id new_thread;
428 Hashtbl.add t.email_to_thread rel.email_id thread_id;
429 t
430
431 let remove_email t email_id =
432 try
433 let thread_id = Hashtbl.find t.email_to_thread email_id in
434 let thread = Hashtbl.find t.threads thread_id in
435
436 (* Remove email from thread *)
437 let updated_emails = List.filter (fun id -> not (Jmap.Id.equal id email_id)) thread.email_ids in
438
439 if List.length updated_emails = 0 then
440 (* Remove empty thread *)
441 Hashtbl.remove t.threads thread_id
442 else
443 (* Update thread *)
444 let updated_thread = { thread with email_ids = updated_emails } in
445 Hashtbl.replace t.threads thread_id updated_thread;
446
447 Hashtbl.remove t.email_to_thread email_id
448 with Not_found -> ();
449 t
450
451 let find_thread t email_id =
452 try Some (Hashtbl.find t.email_to_thread email_id)
453 with Not_found -> None
454
455 let get_thread_emails t thread_id =
456 try
457 let thread = Hashtbl.find t.threads thread_id in
458 thread.email_ids
459 with Not_found -> []
460
461 let get_all_threads t =
462 Hashtbl.fold (fun _ thread acc -> thread :: acc) t.threads []
463
464 let merge_threads t thread1 thread2 =
465 try
466 let t1 = Hashtbl.find t.threads thread1 in
467 let t2 = Hashtbl.find t.threads thread2 in
468
469 (* Merge thread2 into thread1 *)
470 let merged = {
471 t1 with
472 email_ids = t1.email_ids @ t2.email_ids;
473 last_updated =
474 if Jmap.Date.to_timestamp t1.last_updated > Jmap.Date.to_timestamp t2.last_updated
475 then t1.last_updated
476 else t2.last_updated;
477 } in
478
479 Hashtbl.replace t.threads thread1 merged;
480 Hashtbl.remove t.threads thread2;
481
482 (* Update email mappings *)
483 List.iter (fun email_id ->
484 Hashtbl.replace t.email_to_thread email_id thread1
485 ) t2.email_ids
486 with Not_found -> ();
487 t
488
489 let split_thread t thread_id split_point =
490 try
491 let thread = Hashtbl.find t.threads thread_id in
492
493 (* Find split position *)
494 let rec split_at acc = function
495 | [] -> (List.rev acc, [])
496 | (h :: t') as l ->
497 if Jmap.Id.equal h split_point then
498 (List.rev acc, l)
499 else
500 split_at (h :: acc) t'
501 in
502
503 let (before, after) = split_at [] thread.email_ids in
504
505 if List.length after > 0 then begin
506 (* Update original thread *)
507 let updated_thread = { thread with email_ids = before } in
508 Hashtbl.replace t.threads thread_id updated_thread;
509
510 (* Create new thread *)
511 let new_thread_id =
512 let id_str = Printf.sprintf "thread-%d" t.next_thread_id in
513 t.next_thread_id <- t.next_thread_id + 1;
514 Jmap.Id.of_string id_str |> Result.get_ok
515 in
516 let new_thread = {
517 thread_id = new_thread_id;
518 email_ids = after;
519 root_email_id = Some split_point;
520 last_updated = thread.last_updated;
521 } in
522 Hashtbl.add t.threads new_thread_id new_thread;
523
524 (* Update email mappings *)
525 List.iter (fun email_id ->
526 Hashtbl.replace t.email_to_thread email_id new_thread_id
527 ) after
528 end
529 with Not_found -> ();
530 t
531
532 let recalculate t algorithm =
533 (* Collect all emails *)
534 let all_emails = ref [] in
535 Hashtbl.iter (fun email_id _ ->
536 (* Would need actual email objects here *)
537 all_emails := email_id :: !all_emails
538 ) t.email_to_thread;
539
540 (* Clear current state *)
541 Hashtbl.clear t.threads;
542 Hashtbl.clear t.email_to_thread;
543 t.next_thread_id <- 1;
544
545 (* Note: Would need actual email objects to rethread *)
546 (* This is a stub that maintains the structure *)
547 t
548end
549
550(** Check if two emails are related *)
551let are_related email1 email2 =
552 let rel1 = extract_relationships email1 in
553 let rel2 = extract_relationships email2 in
554
555 (* Check direct parent-child relationship *)
556 let direct_relation =
557 match rel1.message_id, rel2.in_reply_to with
558 | Some id1, Some id2 when id1 = id2 -> true
559 | _ -> match rel2.message_id, rel1.in_reply_to with
560 | Some id1, Some id2 when id1 = id2 -> true
561 | _ -> false
562 in
563
564 (* Check if they share references *)
565 let shared_refs =
566 List.exists (fun r1 -> List.mem r1 rel2.references) rel1.references
567 in
568
569 (* Check subject similarity *)
570 let similar_subject =
571 normalize_subject rel1.subject = normalize_subject rel2.subject
572 in
573
574 direct_relation || shared_refs || similar_subject
575
576(** Sort emails within a thread *)
577let sort_thread_emails emails =
578 (* Build parent-child relationships *)
579 let relationships = List.map (fun e -> (e, extract_relationships e)) emails in
580
581 (* Sort by date first *)
582 let sorted = List.sort (fun (_, r1) (_, r2) ->
583 compare (Jmap.Date.to_timestamp r1.date) (Jmap.Date.to_timestamp r2.date)
584 ) relationships in
585
586 List.map fst sorted
587
588(** Calculate threading statistics *)
589let calculate_stats threads =
590 let thread_count = List.length threads in
591 let thread_sizes = List.map (fun t -> List.length t.email_ids) threads in
592 let total_emails = List.fold_left (+) 0 thread_sizes in
593 let avg_size = if thread_count > 0 then float_of_int total_emails /. float_of_int thread_count else 0.0 in
594 let max_size = List.fold_left max 0 thread_sizes in
595 let singletons = List.filter (fun s -> s = 1) thread_sizes |> List.length in
596 let multi = thread_count - singletons in
597
598 [
599 `ThreadCount thread_count;
600 `AverageThreadSize avg_size;
601 `LargestThread max_size;
602 `SingletonThreads singletons;
603 `MultiEmailThreads multi;
604 ]