My agentic slop goes here. Not intended for anyone else!
at main 6.9 kB view raw
(*
 * Copyright (c) 2014, OCaml.org project
 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *)

(** Feed quality analysis. *)

(** Aggregate quality metrics for a list of feed entries.

    The field descriptions below state how [analyze] fills the record;
    [make] stores whatever values it is given without validation. *)
type t = {
  total_entries : int;  (** Number of entries analysed. *)
  entries_with_summary : int;
      (** Entries for which [has_summary] holds. *)
  entries_with_author : int;
      (** Entries for which [has_author] holds. *)
  entries_with_date : int;
      (** Entries carrying a date; [analyze] sets this to [total_entries]. *)
  entries_with_content : int;
      (** Entries whose [get_content_length] is strictly positive. *)
  entries_with_tags : int;
      (** Entries with a non-empty category list. *)
  avg_content_length : float;
      (** Mean content length over the entries that have content, or [0.0]
          when no entry has content. *)
  min_content_length : int;
      (** Smallest content length among entries that have content, or [0]. *)
  max_content_length : int;
      (** Largest content length among entries that have content, or [0]. *)
  posting_frequency_days : float option;
      (** Average number of days between entries, or [None] when fewer than
          two entries were analysed. *)
  quality_score : float;
      (** Weighted score as computed by [calculate_quality_score]. *)
}

(** [make ~total_entries ... ~quality_score] builds a metrics record directly
    from its labelled components. No consistency check is performed. *)
let make ~total_entries ~entries_with_summary ~entries_with_author
    ~entries_with_date ~entries_with_content ~entries_with_tags
    ~avg_content_length ~min_content_length ~max_content_length
    ~posting_frequency_days ~quality_score =
  {
    total_entries;
    entries_with_summary;
    entries_with_author;
    entries_with_date;
    entries_with_content;
    entries_with_tags;
    avg_content_length;
    min_content_length;
    max_content_length;
    posting_frequency_days;
    quality_score;
  }

(** Field accessors. *)

let total_entries t = t.total_entries
let entries_with_summary t = t.entries_with_summary
let entries_with_author t = t.entries_with_author
let entries_with_date t = t.entries_with_date
let entries_with_content t = t.entries_with_content
let entries_with_tags t = t.entries_with_tags
let avg_content_length t = t.avg_content_length
let min_content_length t = t.min_content_length
let max_content_length t = t.max_content_length
let posting_frequency_days t = t.posting_frequency_days
let quality_score t = t.quality_score

(** [get_content_length entry] is the byte length ([String.length]) of the
    entry content when it is [Text] or [Html].

    [Xhtml], [Mime] and [Src] content count as [0]. The summary is consulted
    only when the entry has no content element at all; an [Xhtml] summary, or
    no summary, also counts as [0]. *)
let get_content_length (entry : Syndic.Atom.entry) =
  match entry.content with
  | Some (Syndic.Atom.Text s) | Some (Syndic.Atom.Html (_, s)) ->
      String.length s
  | Some (Syndic.Atom.Xhtml (_, _))
  | Some (Syndic.Atom.Mime _)
  | Some (Syndic.Atom.Src _) ->
      (* Not measured: would require walking the XML tree or fetching data. *)
      0
  | None -> (
      match entry.summary with
      | Some (Syndic.Atom.Text s) | Some (Syndic.Atom.Html (_, s)) ->
          String.length s
      | Some (Syndic.Atom.Xhtml (_, _)) | None -> 0)

(** [has_summary entry] is [true] when the entry has a [Text] or [Html]
    summary that is not blank after trimming, or any [Xhtml] summary. *)
let has_summary (entry : Syndic.Atom.entry) =
  (* Every case is listed so the compiler flags new constructors. *)
  match entry.summary with
  | Some (Syndic.Atom.Text s) | Some (Syndic.Atom.Html (_, s)) ->
      String.trim s <> ""
  | Some (Syndic.Atom.Xhtml (_, _)) -> true
  | None -> false

(** [has_author entry] is [true] when the first author of the entry has a
    name that is not blank after trimming. Additional authors are ignored. *)
let has_author (entry : Syndic.Atom.entry) =
  let first_author, _ = entry.authors in
  String.trim first_author.name <> ""

(** [has_content entry] is [true] when [get_content_length entry] is
    strictly positive. *)
let has_content (entry : Syndic.Atom.entry) = get_content_length entry > 0

(** [has_tags entry] is [true] when the entry has at least one category. *)
let has_tags (entry : Syndic.Atom.entry) =
  match entry.categories with
  | [] -> false
  | _ :: _ -> true

(** [calculate_quality_score t] combines the per-field coverage percentages
    of [t] into a single weighted score: content 30%, date 25%, author 20%,
    summary 15%, tags 10%.

    Returns [0.0] when [t.total_entries] is [0]. The [quality_score] field of
    [t] itself is not read. *)
let calculate_quality_score t =
  if t.total_entries = 0 then 0.0
  else
    let total = float_of_int t.total_entries in
    (* Share of entries, as a percentage of the total. *)
    let pct count = float_of_int count /. total *. 100.0 in
    (pct t.entries_with_content *. 0.30)
    +. (pct t.entries_with_date *. 0.25)
    +. (pct t.entries_with_author *. 0.20)
    +. (pct t.entries_with_summary *. 0.15)
    +. (pct t.entries_with_tags *. 0.10)

(** [analyze entries] computes the quality metrics of a list of Atom entries.

    - Coverage counters use [has_summary], [has_author], [has_content] and
      [has_tags]; [entries_with_date] equals the number of entries because
      the [updated] field is mandatory in the entry record.
    - Content statistics only consider entries whose content length is
      strictly positive; they are [(0.0, 0, 0)] when there is none.
    - [posting_frequency_days] is the span between the earliest and the
      latest [updated] timestamps, in days, divided by the number of
      intervals; it is [None] for a single entry.

    @raise Failure if [entries] is empty. *)
let analyze (entries : Syndic.Atom.entry list) =
  match entries with
  | [] -> failwith "No entries to analyze"
  | _ :: _ ->
      let total_entries = List.length entries in
      let count pred =
        List.fold_left (fun n e -> if pred e then n + 1 else n) 0 entries
      in
      (* Measure each entry once; zero-length entries have no content. *)
      let content_lengths =
        List.filter_map
          (fun e ->
            let len = get_content_length e in
            if len > 0 then Some len else None)
          entries
      in
      let entries_with_content = List.length content_lengths in
      let avg_content_length, min_content_length, max_content_length =
        match content_lengths with
        | [] -> (0.0, 0, 0)
        | first :: rest ->
            let sum, lo, hi =
              List.fold_left
                (fun (sum, lo, hi) (len : int) ->
                  ( sum + len,
                    (if len < lo then len else lo),
                    if len > hi then len else hi ))
                (first, first, first) rest
            in
            (float_of_int sum /. float_of_int entries_with_content, lo, hi)
      in
      let posting_frequency_days =
        let timestamps =
          List.map
            (fun (e : Syndic.Atom.entry) -> Ptime.to_float_s e.updated)
            entries
        in
        match timestamps with
        | [] | [ _ ] -> None
        | first :: rest ->
            (* Nothing here can raise, so no exception handler is needed. *)
            let earliest, latest =
              List.fold_left
                (fun (lo, hi) ts -> (Float.min lo ts, Float.max hi ts))
                (first, first) rest
            in
            let total_days = (latest -. earliest) /. 86400.0 in
            let num_intervals = float_of_int (total_entries - 1) in
            Some (total_days /. num_intervals)
      in
      (* The score is derived from the counters, so build the record with a
         placeholder first and fill in the real score afterwards. *)
      let metrics =
        {
          total_entries;
          entries_with_summary = count has_summary;
          entries_with_author = count has_author;
          entries_with_date = total_entries;
          entries_with_content;
          entries_with_tags = count has_tags;
          avg_content_length;
          min_content_length;
          max_content_length;
          posting_frequency_days;
          quality_score = 0.0;
        }
      in
      { metrics with quality_score = calculate_quality_score metrics }