My agentic slop goes here. Not intended for anyone else!
1(*
2 * Copyright (c) 2014, OCaml.org project
3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 *)
17
18(** Feed quality analysis. *)
19
(** Quality metrics for a set of feed entries.

    The field descriptions below reflect the values that [analyze] stores;
    [make] stores whatever values it is given. *)
type t = {
  total_entries : int;  (** Number of entries the metrics cover. *)
  entries_with_summary : int;  (** Entries counted as having a summary. *)
  entries_with_author : int;  (** Entries counted as having an author. *)
  entries_with_date : int;  (** Entries counted as having a date. *)
  entries_with_content : int;  (** Entries counted as having content. *)
  entries_with_tags : int;  (** Entries counted as having tags. *)
  avg_content_length : float;
      (** Mean content length over the entries that have content. *)
  min_content_length : int;
      (** Smallest content length among entries that have content. *)
  max_content_length : int;
      (** Largest content length among entries that have content. *)
  posting_frequency_days : float option;
      (** Average number of days between entries, when it can be computed. *)
  quality_score : float;
      (** Weighted score, see [calculate_quality_score]. *)
}
33
(** [make ~total_entries ... ~quality_score] builds a metrics record whose
    fields are exactly the given labelled arguments. No validation or
    recomputation is performed: in particular [quality_score] is stored as
    given. *)
let make ~total_entries:total ~entries_with_summary:with_summary
    ~entries_with_author:with_author ~entries_with_date:with_date
    ~entries_with_content:with_content ~entries_with_tags:with_tags
    ~avg_content_length:avg_len ~min_content_length:min_len
    ~max_content_length:max_len ~posting_frequency_days:frequency
    ~quality_score:score =
  {
    total_entries = total;
    entries_with_summary = with_summary;
    entries_with_author = with_author;
    entries_with_date = with_date;
    entries_with_content = with_content;
    entries_with_tags = with_tags;
    avg_content_length = avg_len;
    min_content_length = min_len;
    max_content_length = max_len;
    posting_frequency_days = frequency;
    quality_score = score;
  }
51
(* Field accessors: each one returns the like-named field of the record. *)

(** [total_entries t] is the [total_entries] field of [t]. *)
let total_entries { total_entries; _ } = total_entries

(** [entries_with_summary t] is the [entries_with_summary] field of [t]. *)
let entries_with_summary { entries_with_summary; _ } = entries_with_summary

(** [entries_with_author t] is the [entries_with_author] field of [t]. *)
let entries_with_author { entries_with_author; _ } = entries_with_author

(** [entries_with_date t] is the [entries_with_date] field of [t]. *)
let entries_with_date { entries_with_date; _ } = entries_with_date

(** [entries_with_content t] is the [entries_with_content] field of [t]. *)
let entries_with_content { entries_with_content; _ } = entries_with_content

(** [entries_with_tags t] is the [entries_with_tags] field of [t]. *)
let entries_with_tags { entries_with_tags; _ } = entries_with_tags

(** [avg_content_length t] is the [avg_content_length] field of [t]. *)
let avg_content_length { avg_content_length; _ } = avg_content_length

(** [min_content_length t] is the [min_content_length] field of [t]. *)
let min_content_length { min_content_length; _ } = min_content_length

(** [max_content_length t] is the [max_content_length] field of [t]. *)
let max_content_length { max_content_length; _ } = max_content_length

(** [posting_frequency_days t] is the [posting_frequency_days] field of [t]. *)
let posting_frequency_days { posting_frequency_days; _ } =
  posting_frequency_days

(** [quality_score t] is the [quality_score] field of [t]. *)
let quality_score { quality_score; _ } = quality_score
63
(** [get_content_length entry] is the length in bytes of the textual body of
    [entry].

    When the entry has a content element, only that element is considered:
    plain-text and HTML content yield the length of their string, while XHTML,
    MIME and out-of-line ([Src]) content count as [0]. The summary is used as
    a fallback only when there is no content element at all, with the same
    rule (XHTML summaries and missing summaries count as [0]). *)
let get_content_length (entry : Syndic.Atom.entry) =
  (* Fallback used only when the entry carries no content element. *)
  let summary_length () =
    match entry.summary with
    | Some (Syndic.Atom.Text body) | Some (Syndic.Atom.Html (_, body)) ->
        String.length body
    | Some (Syndic.Atom.Xhtml _) | None -> 0
  in
  match entry.content with
  | None -> summary_length ()
  | Some (Syndic.Atom.Text body) | Some (Syndic.Atom.Html (_, body)) ->
      String.length body
  | Some (Syndic.Atom.Xhtml _)
  | Some (Syndic.Atom.Mime _)
  | Some (Syndic.Atom.Src _) ->
      (* Length of these forms is not computed; they are reported as 0. *)
      0
78
(** [has_summary entry] tells whether [entry] carries a usable summary.

    A plain-text or HTML summary qualifies when it is not blank after
    trimming; an XHTML summary always qualifies (its body is not inspected);
    a missing summary never does. *)
let has_summary (entry : Syndic.Atom.entry) =
  let is_non_blank text = String.trim text <> "" in
  match entry.summary with
  | None -> false
  | Some (Syndic.Atom.Xhtml _) -> true
  | Some (Syndic.Atom.Text text) | Some (Syndic.Atom.Html (_, text)) ->
      is_non_blank text
86
(** [has_author entry] is [true] when the name of the first author of [entry]
    is not blank after trimming. Only the first element of the [authors] pair
    is examined; the remaining authors are ignored. *)
let has_author (entry : Syndic.Atom.entry) =
  let { Syndic.Atom.name; _ }, _other_authors = entry.authors in
  String.trim name <> ""
91
(** [has_content entry] is [true] when [get_content_length entry] is strictly
    positive. *)
let has_content (entry : Syndic.Atom.entry) =
  let length = get_content_length entry in
  length > 0
95
(** [has_tags entry] is [true] when [entry] has at least one category. *)
let has_tags (entry : Syndic.Atom.entry) =
  match entry.categories with
  | [] -> false
  | _ :: _ -> true
99
(** [calculate_quality_score t] derives a score from the counters of [t].

    Each counter is turned into a percentage of [t.total_entries] and the
    percentages are combined with fixed weights: content 0.30, date 0.25,
    author 0.20, summary 0.15 and tags 0.10. The result is [0.0] when
    [t.total_entries] is [0]. The [quality_score] field of [t] itself is not
    read. *)
let calculate_quality_score t =
  match t.total_entries with
  | 0 -> 0.0
  | n ->
      let total = float_of_int n in
      (* Share of entries, as a percentage of the total. *)
      let pct count = float_of_int count /. total *. 100.0 in
      (* Content and dates carry the largest weights. *)
      (pct t.entries_with_content *. 0.30)
      +. (pct t.entries_with_date *. 0.25)
      +. (pct t.entries_with_author *. 0.20)
      +. (pct t.entries_with_summary *. 0.15)
      +. (pct t.entries_with_tags *. 0.10)
120
(** [analyze entries] computes the quality metrics of a list of Atom entries.

    - The summary, author and tag counters are the number of entries
      satisfying [has_summary], [has_author] and [has_tags] respectively.
    - The content counter and the content-length statistics (average, minimum,
      maximum) only take into account entries whose [get_content_length] is
      strictly positive, which is the [has_content] criterion. When no entry
      has content the statistics are [(0.0, 0, 0)].
    - Every entry is counted as having a date, since each one carries an
      [updated] timestamp.
    - The posting frequency is the span in days between the earliest and the
      latest [updated] timestamps divided by the number of intervals
      (entries minus one); it is [None] when there are fewer than two entries.
    - The quality score is obtained from [calculate_quality_score].

    @raise Failure with message ["No entries to analyze"] if [entries] is
    empty. *)
let analyze entries =
  match entries with
  | [] -> failwith "No entries to analyze"
  | _ :: _ ->
      let count_if pred = List.length (List.filter pred entries) in
      let total_entries = List.length entries in

      (* Compute each length once and keep only the strictly positive ones;
         the number kept is the number of entries that have content. *)
      let content_lengths =
        entries
        |> List.map get_content_length
        |> List.filter (fun len -> len > 0)
      in
      let entries_with_content = List.length content_lengths in

      (* Sum, minimum and maximum in a single pass, without sorting. *)
      let avg_content_length, min_content_length, max_content_length =
        match content_lengths with
        | [] -> (0.0, 0, 0)
        | first :: rest ->
            let sum, shortest, longest =
              List.fold_left
                (fun (sum, lo, hi) len -> (sum + len, min lo len, max hi len))
                (first, first, first) rest
            in
            ( float_of_int sum /. float_of_int entries_with_content,
              shortest,
              longest )
      in

      (* Average gap, in days, between the earliest and latest timestamps. *)
      let posting_frequency_days =
        let timestamps =
          List.map
            (fun (entry : Syndic.Atom.entry) -> Ptime.to_float_s entry.updated)
            entries
        in
        match timestamps with
        | [] | [ _ ] -> None
        | first :: rest ->
            let earliest, latest =
              List.fold_left
                (fun (lo, hi) ts -> (min lo ts, max hi ts))
                (first, first) rest
            in
            let total_days = (latest -. earliest) /. 86400.0 in
            let num_intervals = float_of_int (total_entries - 1) in
            Some (total_days /. num_intervals)
      in

      (* The score is computed from the other fields, so the record is first
         built with a placeholder and then completed. *)
      let metrics =
        {
          total_entries;
          entries_with_summary = count_if has_summary;
          entries_with_author = count_if has_author;
          entries_with_date = total_entries;
          entries_with_content;
          entries_with_tags = count_if has_tags;
          avg_content_length;
          min_content_length;
          max_content_length;
          posting_frequency_days;
          quality_score = 0.0;
        }
      in
      { metrics with quality_score = calculate_quality_score metrics }