My agentic slop goes here. Not intended for anyone else!
1(** Thread Reconstruction Algorithms Implementation. 2 3 Implements RFC 5256 threading algorithms and custom conversation grouping 4 for organizing emails into discussion threads. 5*) 6 7(* Remove open statement to avoid circular dependency *) 8 9type thread_group = { 10 thread_id : Jmap.Id.t; 11 email_ids : Jmap.Id.t list; 12 root_email_id : Jmap.Id.t option; 13 last_updated : Jmap.Date.t; 14} 15 16type email_relationship = { 17 email_id : Jmap.Id.t; 18 message_id : string option; 19 in_reply_to : string option; 20 references : string list; 21 subject : string; 22 date : Jmap.Date.t; 23} 24 25type algorithm = [ 26 | `RFC5256_REFERENCES 27 | `RFC5256_ORDEREDSUBJECT 28 | `HYBRID 29 | `CONVERSATION 30] 31 32(** Extract email relationship information *) 33let extract_relationships (email : Jmap_email.Email.Email.t) : email_relationship = 34 let email_id = match Jmap_email.Email.Email.id email with 35 | Some id -> id 36 | None -> failwith "Email must have an ID for threading" 37 in 38 39 (* Extract Message-ID header *) 40 let message_id = 41 match Jmap_email.Email.Email.headers email with 42 | Some headers -> 43 (try 44 let msg_id_header = List.find (fun h -> 45 String.lowercase_ascii (Jmap_email.Header.name h) = "message-id" 46 ) headers in 47 Some (Jmap_email.Header.value msg_id_header) 48 with Not_found -> None) 49 | None -> None 50 in 51 52 (* Extract In-Reply-To header *) 53 let in_reply_to = 54 match Jmap_email.Email.Email.headers email with 55 | Some headers -> 56 (try 57 let reply_header = List.find (fun h -> 58 String.lowercase_ascii (Jmap_email.Header.name h) = "in-reply-to" 59 ) headers in 60 Some (Jmap_email.Header.value reply_header) 61 with Not_found -> None) 62 | None -> None 63 in 64 65 (* Extract References header *) 66 let references = 67 match Jmap_email.Email.Email.headers email with 68 | Some headers -> 69 (try 70 let refs_header = List.find (fun h -> 71 String.lowercase_ascii (Jmap_email.Header.name h) = "references" 72 ) headers in 73 (* Split references by whitespace *) 74 String.split_on_char ' ' (Jmap_email.Header.value refs_header) 75 |> List.filter (fun s -> String.length s > 0) 76 with Not_found -> []) 77 | None -> [] 78 in 79 80 (* Get normalized subject *) 81 let subject = match Jmap_email.Email.Email.subject email with 82 | Some s -> s 83 | None -> "" 84 in 85 86 (* Get email date *) 87 let date = match Jmap_email.Email.Email.received_at email with 88 | Some d -> d 89 | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok 90 in 91 92 { 93 email_id; 94 message_id; 95 in_reply_to; 96 references; 97 subject; 98 date; 99 } 100 101(** Build a thread group from related emails *) 102let build_thread_group (emails : Email.Email.t list) : thread_group = 103 match emails with 104 | [] -> failwith "Cannot build thread group from empty email list" 105 | _ -> 106 (* Generate thread ID from first email or use hash of message IDs *) 107 let thread_id = 108 let first_email = List.hd emails in 109 match Email.Email.id first_email with 110 | Some id -> id (* Use first email's ID as thread ID *) 111 | None -> Jmap.Id.of_string "thread-generated" |> Result.get_ok 112 in 113 114 (* Extract all email IDs *) 115 let email_ids = List.filter_map Email.Email.id emails in 116 117 (* Find root email (earliest without In-Reply-To) *) 118 let root_email_id = 119 let sorted = List.sort (fun e1 e2 -> 120 let d1 = match Email.Email.received_at e1 with 121 | Some d -> d 122 | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok 123 in 124 let d2 = match Email.Email.received_at e2 with 125 | Some d -> d 126 | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok 127 in 128 compare (Jmap.Date.to_timestamp d1) (Jmap.Date.to_timestamp d2) 129 ) emails in 130 Email.Email.id (List.hd sorted) 131 in 132 133 (* Find most recent email date *) 134 let last_updated = 135 let dates = List.filter_map Email.Email.received_at emails in 136 let sorted_dates = List.sort (fun d1 d2 -> 137 compare (Jmap.Date.to_timestamp d2) (Jmap.Date.to_timestamp d1) 138 ) dates in 139 match sorted_dates with 140 | d :: _ -> d 141 | [] -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok 142 in 143 144 { 145 thread_id; 146 email_ids; 147 root_email_id; 148 last_updated; 149 } 150 151(** Normalize subject for comparison *) 152let normalize_subject subject = 153 let s = String.lowercase_ascii subject in 154 (* Remove common prefixes *) 155 let prefixes = ["re:"; "re :"; "fwd:"; "fwd :"; "fw:"; "fw :"] in 156 let rec remove_prefixes s = function 157 | [] -> s 158 | prefix :: rest -> 159 if String.starts_with ~prefix s then 160 let s' = String.sub s (String.length prefix) (String.length s - String.length prefix) in 161 remove_prefixes (String.trim s') prefixes (* Restart with all prefixes *) 162 else 163 remove_prefixes s rest 164 in 165 let normalized = remove_prefixes (String.trim s) prefixes in 166 (* Collapse whitespace *) 167 String.split_on_char ' ' normalized 168 |> List.filter (fun s -> String.length s > 0) 169 |> String.concat " " 170 171(** Thread by REFERENCES algorithm (RFC 5256) *) 172let thread_by_references emails = 173 (* Build a map of Message-ID to emails *) 174 let message_id_map = Hashtbl.create 100 in 175 let relationships = List.map extract_relationships emails in 176 177 (* Index emails by Message-ID *) 178 List.iter2 (fun email rel -> 179 match rel.message_id with 180 | Some msg_id -> Hashtbl.add message_id_map msg_id email 181 | None -> () 182 ) emails relationships; 183 184 (* Build parent-child relationships *) 185 let thread_groups = Hashtbl.create 50 in 186 let processed = Hashtbl.create 100 in 187 188 List.iter2 (fun email rel -> 189 if not (Hashtbl.mem processed rel.email_id) then begin 190 (* Find thread root by following references *) 191 let thread_emails = ref [email] in 192 193 (* Add emails referenced in References header *) 194 List.iter (fun ref_id -> 195 try 196 let ref_email = Hashtbl.find message_id_map ref_id in 197 if not (List.memq ref_email !thread_emails) then 198 thread_emails := ref_email :: !thread_emails 199 with Not_found -> () 200 ) rel.references; 201 202 (* Add email referenced in In-Reply-To *) 203 (match rel.in_reply_to with 204 | Some reply_id -> 205 (try 206 let parent_email = Hashtbl.find message_id_map reply_id in 207 if not (List.memq parent_email !thread_emails) then 208 thread_emails := parent_email :: !thread_emails 209 with Not_found -> ()) 210 | None -> ()); 211 212 (* Mark all emails as processed *) 213 List.iter (fun e -> 214 match Email.Email.id e with 215 | Some id -> Hashtbl.add processed id true 216 | None -> () 217 ) !thread_emails; 218 219 (* Create thread group *) 220 if List.length !thread_emails > 0 then 221 let group = build_thread_group !thread_emails in 222 Hashtbl.add thread_groups group.thread_id group 223 end 224 ) emails relationships; 225 226 (* Collect all thread groups *) 227 Hashtbl.fold (fun _ group acc -> group :: acc) thread_groups [] 228 229(** Thread by ORDEREDSUBJECT algorithm (RFC 5256) *) 230let thread_by_ordered_subject emails = 231 (* Group emails by normalized subject *) 232 let subject_map = Hashtbl.create 50 in 233 234 List.iter (fun email -> 235 let subject = match Email.Email.subject email with 236 | Some s -> normalize_subject s 237 | None -> "" 238 in 239 let emails_with_subject = 240 try Hashtbl.find subject_map subject 241 with Not_found -> [] 242 in 243 Hashtbl.replace subject_map subject (email :: emails_with_subject) 244 ) emails; 245 246 (* Create thread groups from subject groups *) 247 Hashtbl.fold (fun _ email_list acc -> 248 if List.length email_list > 0 then 249 let sorted_emails = List.sort (fun e1 e2 -> 250 let d1 = match Email.Email.received_at e1 with 251 | Some d -> Jmap.Date.to_timestamp d 252 | None -> 0.0 253 in 254 let d2 = match Email.Email.received_at e2 with 255 | Some d -> Jmap.Date.to_timestamp d 256 | None -> 0.0 257 in 258 compare d1 d2 259 ) email_list in 260 let group = build_thread_group sorted_emails in 261 group :: acc 262 else 263 acc 264 ) subject_map [] 265 266(** Hybrid threading algorithm *) 267let thread_hybrid emails = 268 (* First try REFERENCES algorithm *) 269 let ref_threads = thread_by_references emails in 270 271 (* Collect emails that weren't threaded *) 272 let threaded_ids = Hashtbl.create 100 in 273 List.iter (fun thread -> 274 List.iter (fun id -> Hashtbl.add threaded_ids id true) thread.email_ids 275 ) ref_threads; 276 277 let unthreaded = List.filter (fun email -> 278 match Email.Email.id email with 279 | Some id -> not (Hashtbl.mem threaded_ids id) 280 | None -> false 281 ) emails in 282 283 (* Thread remaining emails by subject *) 284 let subject_threads = thread_by_ordered_subject unthreaded in 285 286 (* Combine results *) 287 ref_threads @ subject_threads 288 289(** Conversation-style threading *) 290let thread_conversations emails = 291 (* Aggressive grouping - combine REFERENCES and subject similarity *) 292 let threads = thread_hybrid emails in 293 294 (* Further merge threads with similar subjects *) 295 let merged = Hashtbl.create 50 in 296 297 List.iter (fun thread -> 298 (* Find if this thread should be merged with an existing one *) 299 let should_merge = ref None in 300 301 Hashtbl.iter (fun tid existing_thread -> 302 (* Check if subjects are similar enough to merge *) 303 if !should_merge = None then begin 304 let thread_emails = List.filter_map (fun id -> 305 List.find_opt (fun e -> 306 match Email.Email.id e with 307 | Some eid -> Jmap.Id.equal eid id 308 | None -> false 309 ) emails 310 ) thread.email_ids in 311 312 let existing_emails = List.filter_map (fun id -> 313 List.find_opt (fun e -> 314 match Email.Email.id e with 315 | Some eid -> Jmap.Id.equal eid id 316 | None -> false 317 ) emails 318 ) existing_thread.email_ids in 319 320 (* Check subject similarity *) 321 let thread_subjects = List.filter_map Email.Email.subject thread_emails 322 |> List.map normalize_subject in 323 let existing_subjects = List.filter_map Email.Email.subject existing_emails 324 |> List.map normalize_subject in 325 326 let common_subjects = List.filter (fun s1 -> 327 List.exists (fun s2 -> s1 = s2) existing_subjects 328 ) thread_subjects in 329 330 if List.length common_subjects > 0 then 331 should_merge := Some tid 332 end 333 ) merged; 334 335 match !should_merge with 336 | Some tid -> 337 (* Merge with existing thread *) 338 let existing = Hashtbl.find merged tid in 339 let merged_thread = { 340 existing with 341 email_ids = existing.email_ids @ thread.email_ids; 342 last_updated = 343 if Jmap.Date.to_timestamp existing.last_updated > Jmap.Date.to_timestamp thread.last_updated 344 then existing.last_updated 345 else thread.last_updated; 346 } in 347 Hashtbl.replace merged tid merged_thread 348 | None -> 349 (* Add as new thread *) 350 Hashtbl.add merged thread.thread_id thread 351 ) threads; 352 353 Hashtbl.fold (fun _ thread acc -> thread :: acc) merged [] 354 355(** Apply specified algorithm *) 356let apply_algorithm algorithm emails = 357 match algorithm with 358 | `RFC5256_REFERENCES -> thread_by_references emails 359 | `RFC5256_ORDEREDSUBJECT -> thread_by_ordered_subject emails 360 | `HYBRID -> thread_hybrid emails 361 | `CONVERSATION -> thread_conversations emails 362 363(** Thread relationship graph *) 364module ThreadGraph = struct 365 type t = { 366 mutable threads : (Jmap.Id.t, thread_group) Hashtbl.t; 367 mutable email_to_thread : (Jmap.Id.t, Jmap.Id.t) Hashtbl.t; 368 mutable next_thread_id : int; 369 } 370 371 let create () = { 372 threads = Hashtbl.create 100; 373 email_to_thread = Hashtbl.create 1000; 374 next_thread_id = 1; 375 } 376 377 let add_email t email = 378 let rel = extract_relationships email in 379 380 (* Check if email belongs to existing thread *) 381 let existing_thread = 382 (* Check by In-Reply-To *) 383 match rel.in_reply_to with 384 | Some reply_id -> 385 (* Find email with this Message-ID *) 386 let parent_thread = ref None in 387 Hashtbl.iter (fun email_id thread_id -> 388 if !parent_thread = None then 389 (* Check if any email in this thread has the Message-ID *) 390 try 391 let thread = Hashtbl.find t.threads thread_id in 392 if List.mem email_id thread.email_ids then 393 parent_thread := Some thread_id 394 with Not_found -> () 395 ) t.email_to_thread; 396 !parent_thread 397 | None -> None 398 in 399 400 match existing_thread with 401 | Some thread_id -> 402 (* Add to existing thread *) 403 let thread = Hashtbl.find t.threads thread_id in 404 let updated_thread = { 405 thread with 406 email_ids = thread.email_ids @ [rel.email_id]; 407 last_updated = 408 if Jmap.Date.to_timestamp thread.last_updated > Jmap.Date.to_timestamp rel.date 409 then thread.last_updated 410 else rel.date; 411 } in 412 Hashtbl.replace t.threads thread_id updated_thread; 413 Hashtbl.add t.email_to_thread rel.email_id thread_id 414 | None -> 415 (* Create new thread *) 416 let thread_id = 417 let id_str = Printf.sprintf "thread-%d" t.next_thread_id in 418 t.next_thread_id <- t.next_thread_id + 1; 419 Jmap.Id.of_string id_str |> Result.get_ok 420 in 421 let new_thread = { 422 thread_id; 423 email_ids = [rel.email_id]; 424 root_email_id = Some rel.email_id; 425 last_updated = rel.date; 426 } in 427 Hashtbl.add t.threads thread_id new_thread; 428 Hashtbl.add t.email_to_thread rel.email_id thread_id; 429 t 430 431 let remove_email t email_id = 432 try 433 let thread_id = Hashtbl.find t.email_to_thread email_id in 434 let thread = Hashtbl.find t.threads thread_id in 435 436 (* Remove email from thread *) 437 let updated_emails = List.filter (fun id -> not (Jmap.Id.equal id email_id)) thread.email_ids in 438 439 if List.length updated_emails = 0 then 440 (* Remove empty thread *) 441 Hashtbl.remove t.threads thread_id 442 else 443 (* Update thread *) 444 let updated_thread = { thread with email_ids = updated_emails } in 445 Hashtbl.replace t.threads thread_id updated_thread; 446 447 Hashtbl.remove t.email_to_thread email_id 448 with Not_found -> (); 449 t 450 451 let find_thread t email_id = 452 try Some (Hashtbl.find t.email_to_thread email_id) 453 with Not_found -> None 454 455 let get_thread_emails t thread_id = 456 try 457 let thread = Hashtbl.find t.threads thread_id in 458 thread.email_ids 459 with Not_found -> [] 460 461 let get_all_threads t = 462 Hashtbl.fold (fun _ thread acc -> thread :: acc) t.threads [] 463 464 let merge_threads t thread1 thread2 = 465 try 466 let t1 = Hashtbl.find t.threads thread1 in 467 let t2 = Hashtbl.find t.threads thread2 in 468 469 (* Merge thread2 into thread1 *) 470 let merged = { 471 t1 with 472 email_ids = t1.email_ids @ t2.email_ids; 473 last_updated = 474 if Jmap.Date.to_timestamp t1.last_updated > Jmap.Date.to_timestamp t2.last_updated 475 then t1.last_updated 476 else t2.last_updated; 477 } in 478 479 Hashtbl.replace t.threads thread1 merged; 480 Hashtbl.remove t.threads thread2; 481 482 (* Update email mappings *) 483 List.iter (fun email_id -> 484 Hashtbl.replace t.email_to_thread email_id thread1 485 ) t2.email_ids 486 with Not_found -> (); 487 t 488 489 let split_thread t thread_id split_point = 490 try 491 let thread = Hashtbl.find t.threads thread_id in 492 493 (* Find split position *) 494 let rec split_at acc = function 495 | [] -> (List.rev acc, []) 496 | (h :: t') as l -> 497 if Jmap.Id.equal h split_point then 498 (List.rev acc, l) 499 else 500 split_at (h :: acc) t' 501 in 502 503 let (before, after) = split_at [] thread.email_ids in 504 505 if List.length after > 0 then begin 506 (* Update original thread *) 507 let updated_thread = { thread with email_ids = before } in 508 Hashtbl.replace t.threads thread_id updated_thread; 509 510 (* Create new thread *) 511 let new_thread_id = 512 let id_str = Printf.sprintf "thread-%d" t.next_thread_id in 513 t.next_thread_id <- t.next_thread_id + 1; 514 Jmap.Id.of_string id_str |> Result.get_ok 515 in 516 let new_thread = { 517 thread_id = new_thread_id; 518 email_ids = after; 519 root_email_id = Some split_point; 520 last_updated = thread.last_updated; 521 } in 522 Hashtbl.add t.threads new_thread_id new_thread; 523 524 (* Update email mappings *) 525 List.iter (fun email_id -> 526 Hashtbl.replace t.email_to_thread email_id new_thread_id 527 ) after 528 end 529 with Not_found -> (); 530 t 531 532 let recalculate t algorithm = 533 (* Collect all emails *) 534 let all_emails = ref [] in 535 Hashtbl.iter (fun email_id _ -> 536 (* Would need actual email objects here *) 537 all_emails := email_id :: !all_emails 538 ) t.email_to_thread; 539 540 (* Clear current state *) 541 Hashtbl.clear t.threads; 542 Hashtbl.clear t.email_to_thread; 543 t.next_thread_id <- 1; 544 545 (* Note: Would need actual email objects to rethread *) 546 (* This is a stub that maintains the structure *) 547 t 548end 549 550(** Check if two emails are related *) 551let are_related email1 email2 = 552 let rel1 = extract_relationships email1 in 553 let rel2 = extract_relationships email2 in 554 555 (* Check direct parent-child relationship *) 556 let direct_relation = 557 match rel1.message_id, rel2.in_reply_to with 558 | Some id1, Some id2 when id1 = id2 -> true 559 | _ -> match rel2.message_id, rel1.in_reply_to with 560 | Some id1, Some id2 when id1 = id2 -> true 561 | _ -> false 562 in 563 564 (* Check if they share references *) 565 let shared_refs = 566 List.exists (fun r1 -> List.mem r1 rel2.references) rel1.references 567 in 568 569 (* Check subject similarity *) 570 let similar_subject = 571 normalize_subject rel1.subject = normalize_subject rel2.subject 572 in 573 574 direct_relation || shared_refs || similar_subject 575 576(** Sort emails within a thread *) 577let sort_thread_emails emails = 578 (* Build parent-child relationships *) 579 let relationships = List.map (fun e -> (e, extract_relationships e)) emails in 580 581 (* Sort by date first *) 582 let sorted = List.sort (fun (_, r1) (_, r2) -> 583 compare (Jmap.Date.to_timestamp r1.date) (Jmap.Date.to_timestamp r2.date) 584 ) relationships in 585 586 List.map fst sorted 587 588(** Calculate threading statistics *) 589let calculate_stats threads = 590 let thread_count = List.length threads in 591 let thread_sizes = List.map (fun t -> List.length t.email_ids) threads in 592 let total_emails = List.fold_left (+) 0 thread_sizes in 593 let avg_size = if thread_count > 0 then float_of_int total_emails /. float_of_int thread_count else 0.0 in 594 let max_size = List.fold_left max 0 thread_sizes in 595 let singletons = List.filter (fun s -> s = 1) thread_sizes |> List.length in 596 let multi = thread_count - singletons in 597 598 [ 599 `ThreadCount thread_count; 600 `AverageThreadSize avg_size; 601 `LargestThread max_size; 602 `SingletonThreads singletons; 603 `MultiEmailThreads multi; 604 ]