(** Thread Reconstruction Algorithms Implementation. Implements RFC 5256 threading algorithms and custom conversation grouping for organizing emails into discussion threads. *) (* Remove open statement to avoid circular dependency *) type thread_group = { thread_id : Jmap.Id.t; email_ids : Jmap.Id.t list; root_email_id : Jmap.Id.t option; last_updated : Jmap.Date.t; } type email_relationship = { email_id : Jmap.Id.t; message_id : string option; in_reply_to : string option; references : string list; subject : string; date : Jmap.Date.t; } type algorithm = [ | `RFC5256_REFERENCES | `RFC5256_ORDEREDSUBJECT | `HYBRID | `CONVERSATION ] (** Extract email relationship information *) let extract_relationships (email : Jmap_email.Email.Email.t) : email_relationship = let email_id = match Jmap_email.Email.Email.id email with | Some id -> id | None -> failwith "Email must have an ID for threading" in (* Extract Message-ID header *) let message_id = match Jmap_email.Email.Email.headers email with | Some headers -> (try let msg_id_header = List.find (fun h -> String.lowercase_ascii (Jmap_email.Header.name h) = "message-id" ) headers in Some (Jmap_email.Header.value msg_id_header) with Not_found -> None) | None -> None in (* Extract In-Reply-To header *) let in_reply_to = match Jmap_email.Email.Email.headers email with | Some headers -> (try let reply_header = List.find (fun h -> String.lowercase_ascii (Jmap_email.Header.name h) = "in-reply-to" ) headers in Some (Jmap_email.Header.value reply_header) with Not_found -> None) | None -> None in (* Extract References header *) let references = match Jmap_email.Email.Email.headers email with | Some headers -> (try let refs_header = List.find (fun h -> String.lowercase_ascii (Jmap_email.Header.name h) = "references" ) headers in (* Split references by whitespace *) String.split_on_char ' ' (Jmap_email.Header.value refs_header) |> List.filter (fun s -> String.length s > 0) with Not_found -> []) | None -> [] in (* Get normalized subject *) let subject = match Jmap_email.Email.Email.subject email with | Some s -> s | None -> "" in (* Get email date *) let date = match Jmap_email.Email.Email.received_at email with | Some d -> d | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok in { email_id; message_id; in_reply_to; references; subject; date; } (** Build a thread group from related emails *) let build_thread_group (emails : Email.Email.t list) : thread_group = match emails with | [] -> failwith "Cannot build thread group from empty email list" | _ -> (* Generate thread ID from first email or use hash of message IDs *) let thread_id = let first_email = List.hd emails in match Email.Email.id first_email with | Some id -> id (* Use first email's ID as thread ID *) | None -> Jmap.Id.of_string "thread-generated" |> Result.get_ok in (* Extract all email IDs *) let email_ids = List.filter_map Email.Email.id emails in (* Find root email (earliest without In-Reply-To) *) let root_email_id = let sorted = List.sort (fun e1 e2 -> let d1 = match Email.Email.received_at e1 with | Some d -> d | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok in let d2 = match Email.Email.received_at e2 with | Some d -> d | None -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok in compare (Jmap.Date.to_timestamp d1) (Jmap.Date.to_timestamp d2) ) emails in Email.Email.id (List.hd sorted) in (* Find most recent email date *) let last_updated = let dates = List.filter_map Email.Email.received_at emails in let sorted_dates = List.sort (fun d1 d2 -> compare (Jmap.Date.to_timestamp d2) (Jmap.Date.to_timestamp d1) ) dates in match sorted_dates with | d :: _ -> d | [] -> Jmap.Date.of_string "1970-01-01T00:00:00Z" |> Result.get_ok in { thread_id; email_ids; root_email_id; last_updated; } (** Normalize subject for comparison *) let normalize_subject subject = let s = String.lowercase_ascii subject in (* Remove common prefixes *) let prefixes = ["re:"; "re :"; "fwd:"; "fwd :"; "fw:"; "fw :"] in let rec remove_prefixes s = function | [] -> s | prefix :: rest -> if String.starts_with ~prefix s then let s' = String.sub s (String.length prefix) (String.length s - String.length prefix) in remove_prefixes (String.trim s') prefixes (* Restart with all prefixes *) else remove_prefixes s rest in let normalized = remove_prefixes (String.trim s) prefixes in (* Collapse whitespace *) String.split_on_char ' ' normalized |> List.filter (fun s -> String.length s > 0) |> String.concat " " (** Thread by REFERENCES algorithm (RFC 5256) *) let thread_by_references emails = (* Build a map of Message-ID to emails *) let message_id_map = Hashtbl.create 100 in let relationships = List.map extract_relationships emails in (* Index emails by Message-ID *) List.iter2 (fun email rel -> match rel.message_id with | Some msg_id -> Hashtbl.add message_id_map msg_id email | None -> () ) emails relationships; (* Build parent-child relationships *) let thread_groups = Hashtbl.create 50 in let processed = Hashtbl.create 100 in List.iter2 (fun email rel -> if not (Hashtbl.mem processed rel.email_id) then begin (* Find thread root by following references *) let thread_emails = ref [email] in (* Add emails referenced in References header *) List.iter (fun ref_id -> try let ref_email = Hashtbl.find message_id_map ref_id in if not (List.memq ref_email !thread_emails) then thread_emails := ref_email :: !thread_emails with Not_found -> () ) rel.references; (* Add email referenced in In-Reply-To *) (match rel.in_reply_to with | Some reply_id -> (try let parent_email = Hashtbl.find message_id_map reply_id in if not (List.memq parent_email !thread_emails) then thread_emails := parent_email :: !thread_emails with Not_found -> ()) | None -> ()); (* Mark all emails as processed *) List.iter (fun e -> match Email.Email.id e with | Some id -> Hashtbl.add processed id true | None -> () ) !thread_emails; (* Create thread group *) if List.length !thread_emails > 0 then let group = build_thread_group !thread_emails in Hashtbl.add thread_groups group.thread_id group end ) emails relationships; (* Collect all thread groups *) Hashtbl.fold (fun _ group acc -> group :: acc) thread_groups [] (** Thread by ORDEREDSUBJECT algorithm (RFC 5256) *) let thread_by_ordered_subject emails = (* Group emails by normalized subject *) let subject_map = Hashtbl.create 50 in List.iter (fun email -> let subject = match Email.Email.subject email with | Some s -> normalize_subject s | None -> "" in let emails_with_subject = try Hashtbl.find subject_map subject with Not_found -> [] in Hashtbl.replace subject_map subject (email :: emails_with_subject) ) emails; (* Create thread groups from subject groups *) Hashtbl.fold (fun _ email_list acc -> if List.length email_list > 0 then let sorted_emails = List.sort (fun e1 e2 -> let d1 = match Email.Email.received_at e1 with | Some d -> Jmap.Date.to_timestamp d | None -> 0.0 in let d2 = match Email.Email.received_at e2 with | Some d -> Jmap.Date.to_timestamp d | None -> 0.0 in compare d1 d2 ) email_list in let group = build_thread_group sorted_emails in group :: acc else acc ) subject_map [] (** Hybrid threading algorithm *) let thread_hybrid emails = (* First try REFERENCES algorithm *) let ref_threads = thread_by_references emails in (* Collect emails that weren't threaded *) let threaded_ids = Hashtbl.create 100 in List.iter (fun thread -> List.iter (fun id -> Hashtbl.add threaded_ids id true) thread.email_ids ) ref_threads; let unthreaded = List.filter (fun email -> match Email.Email.id email with | Some id -> not (Hashtbl.mem threaded_ids id) | None -> false ) emails in (* Thread remaining emails by subject *) let subject_threads = thread_by_ordered_subject unthreaded in (* Combine results *) ref_threads @ subject_threads (** Conversation-style threading *) let thread_conversations emails = (* Aggressive grouping - combine REFERENCES and subject similarity *) let threads = thread_hybrid emails in (* Further merge threads with similar subjects *) let merged = Hashtbl.create 50 in List.iter (fun thread -> (* Find if this thread should be merged with an existing one *) let should_merge = ref None in Hashtbl.iter (fun tid existing_thread -> (* Check if subjects are similar enough to merge *) if !should_merge = None then begin let thread_emails = List.filter_map (fun id -> List.find_opt (fun e -> match Email.Email.id e with | Some eid -> Jmap.Id.equal eid id | None -> false ) emails ) thread.email_ids in let existing_emails = List.filter_map (fun id -> List.find_opt (fun e -> match Email.Email.id e with | Some eid -> Jmap.Id.equal eid id | None -> false ) emails ) existing_thread.email_ids in (* Check subject similarity *) let thread_subjects = List.filter_map Email.Email.subject thread_emails |> List.map normalize_subject in let existing_subjects = List.filter_map Email.Email.subject existing_emails |> List.map normalize_subject in let common_subjects = List.filter (fun s1 -> List.exists (fun s2 -> s1 = s2) existing_subjects ) thread_subjects in if List.length common_subjects > 0 then should_merge := Some tid end ) merged; match !should_merge with | Some tid -> (* Merge with existing thread *) let existing = Hashtbl.find merged tid in let merged_thread = { existing with email_ids = existing.email_ids @ thread.email_ids; last_updated = if Jmap.Date.to_timestamp existing.last_updated > Jmap.Date.to_timestamp thread.last_updated then existing.last_updated else thread.last_updated; } in Hashtbl.replace merged tid merged_thread | None -> (* Add as new thread *) Hashtbl.add merged thread.thread_id thread ) threads; Hashtbl.fold (fun _ thread acc -> thread :: acc) merged [] (** Apply specified algorithm *) let apply_algorithm algorithm emails = match algorithm with | `RFC5256_REFERENCES -> thread_by_references emails | `RFC5256_ORDEREDSUBJECT -> thread_by_ordered_subject emails | `HYBRID -> thread_hybrid emails | `CONVERSATION -> thread_conversations emails (** Thread relationship graph *) module ThreadGraph = struct type t = { mutable threads : (Jmap.Id.t, thread_group) Hashtbl.t; mutable email_to_thread : (Jmap.Id.t, Jmap.Id.t) Hashtbl.t; mutable next_thread_id : int; } let create () = { threads = Hashtbl.create 100; email_to_thread = Hashtbl.create 1000; next_thread_id = 1; } let add_email t email = let rel = extract_relationships email in (* Check if email belongs to existing thread *) let existing_thread = (* Check by In-Reply-To *) match rel.in_reply_to with | Some reply_id -> (* Find email with this Message-ID *) let parent_thread = ref None in Hashtbl.iter (fun email_id thread_id -> if !parent_thread = None then (* Check if any email in this thread has the Message-ID *) try let thread = Hashtbl.find t.threads thread_id in if List.mem email_id thread.email_ids then parent_thread := Some thread_id with Not_found -> () ) t.email_to_thread; !parent_thread | None -> None in match existing_thread with | Some thread_id -> (* Add to existing thread *) let thread = Hashtbl.find t.threads thread_id in let updated_thread = { thread with email_ids = thread.email_ids @ [rel.email_id]; last_updated = if Jmap.Date.to_timestamp thread.last_updated > Jmap.Date.to_timestamp rel.date then thread.last_updated else rel.date; } in Hashtbl.replace t.threads thread_id updated_thread; Hashtbl.add t.email_to_thread rel.email_id thread_id | None -> (* Create new thread *) let thread_id = let id_str = Printf.sprintf "thread-%d" t.next_thread_id in t.next_thread_id <- t.next_thread_id + 1; Jmap.Id.of_string id_str |> Result.get_ok in let new_thread = { thread_id; email_ids = [rel.email_id]; root_email_id = Some rel.email_id; last_updated = rel.date; } in Hashtbl.add t.threads thread_id new_thread; Hashtbl.add t.email_to_thread rel.email_id thread_id; t let remove_email t email_id = try let thread_id = Hashtbl.find t.email_to_thread email_id in let thread = Hashtbl.find t.threads thread_id in (* Remove email from thread *) let updated_emails = List.filter (fun id -> not (Jmap.Id.equal id email_id)) thread.email_ids in if List.length updated_emails = 0 then (* Remove empty thread *) Hashtbl.remove t.threads thread_id else (* Update thread *) let updated_thread = { thread with email_ids = updated_emails } in Hashtbl.replace t.threads thread_id updated_thread; Hashtbl.remove t.email_to_thread email_id with Not_found -> (); t let find_thread t email_id = try Some (Hashtbl.find t.email_to_thread email_id) with Not_found -> None let get_thread_emails t thread_id = try let thread = Hashtbl.find t.threads thread_id in thread.email_ids with Not_found -> [] let get_all_threads t = Hashtbl.fold (fun _ thread acc -> thread :: acc) t.threads [] let merge_threads t thread1 thread2 = try let t1 = Hashtbl.find t.threads thread1 in let t2 = Hashtbl.find t.threads thread2 in (* Merge thread2 into thread1 *) let merged = { t1 with email_ids = t1.email_ids @ t2.email_ids; last_updated = if Jmap.Date.to_timestamp t1.last_updated > Jmap.Date.to_timestamp t2.last_updated then t1.last_updated else t2.last_updated; } in Hashtbl.replace t.threads thread1 merged; Hashtbl.remove t.threads thread2; (* Update email mappings *) List.iter (fun email_id -> Hashtbl.replace t.email_to_thread email_id thread1 ) t2.email_ids with Not_found -> (); t let split_thread t thread_id split_point = try let thread = Hashtbl.find t.threads thread_id in (* Find split position *) let rec split_at acc = function | [] -> (List.rev acc, []) | (h :: t') as l -> if Jmap.Id.equal h split_point then (List.rev acc, l) else split_at (h :: acc) t' in let (before, after) = split_at [] thread.email_ids in if List.length after > 0 then begin (* Update original thread *) let updated_thread = { thread with email_ids = before } in Hashtbl.replace t.threads thread_id updated_thread; (* Create new thread *) let new_thread_id = let id_str = Printf.sprintf "thread-%d" t.next_thread_id in t.next_thread_id <- t.next_thread_id + 1; Jmap.Id.of_string id_str |> Result.get_ok in let new_thread = { thread_id = new_thread_id; email_ids = after; root_email_id = Some split_point; last_updated = thread.last_updated; } in Hashtbl.add t.threads new_thread_id new_thread; (* Update email mappings *) List.iter (fun email_id -> Hashtbl.replace t.email_to_thread email_id new_thread_id ) after end with Not_found -> (); t let recalculate t algorithm = (* Collect all emails *) let all_emails = ref [] in Hashtbl.iter (fun email_id _ -> (* Would need actual email objects here *) all_emails := email_id :: !all_emails ) t.email_to_thread; (* Clear current state *) Hashtbl.clear t.threads; Hashtbl.clear t.email_to_thread; t.next_thread_id <- 1; (* Note: Would need actual email objects to rethread *) (* This is a stub that maintains the structure *) t end (** Check if two emails are related *) let are_related email1 email2 = let rel1 = extract_relationships email1 in let rel2 = extract_relationships email2 in (* Check direct parent-child relationship *) let direct_relation = match rel1.message_id, rel2.in_reply_to with | Some id1, Some id2 when id1 = id2 -> true | _ -> match rel2.message_id, rel1.in_reply_to with | Some id1, Some id2 when id1 = id2 -> true | _ -> false in (* Check if they share references *) let shared_refs = List.exists (fun r1 -> List.mem r1 rel2.references) rel1.references in (* Check subject similarity *) let similar_subject = normalize_subject rel1.subject = normalize_subject rel2.subject in direct_relation || shared_refs || similar_subject (** Sort emails within a thread *) let sort_thread_emails emails = (* Build parent-child relationships *) let relationships = List.map (fun e -> (e, extract_relationships e)) emails in (* Sort by date first *) let sorted = List.sort (fun (_, r1) (_, r2) -> compare (Jmap.Date.to_timestamp r1.date) (Jmap.Date.to_timestamp r2.date) ) relationships in List.map fst sorted (** Calculate threading statistics *) let calculate_stats threads = let thread_count = List.length threads in let thread_sizes = List.map (fun t -> List.length t.email_ids) threads in let total_emails = List.fold_left (+) 0 thread_sizes in let avg_size = if thread_count > 0 then float_of_int total_emails /. float_of_int thread_count else 0.0 in let max_size = List.fold_left max 0 thread_sizes in let singletons = List.filter (fun s -> s = 1) thread_sizes |> List.length in let multi = thread_count - singletons in [ `ThreadCount thread_count; `AverageThreadSize avg_size; `LargestThread max_size; `SingletonThreads singletons; `MultiEmailThreads multi; ]