(* Post module of the River feed aggregator. *)
1(*
2 * Copyright (c) 2014, OCaml.org project
3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 *)
17
18(** Post representation and extraction from feeds. *)
19
(* Logging source for this library; all logging below goes through [Log]. *)
let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
module Log = (val Logs.src_log src : Logs.LOG)
22
(** A single aggregated post, normalised from Atom, RSS2 or JSONFeed input. *)
type t = {
  id : string;  (** Stable unique identifier (see [generate_id]). *)
  title : string;
  link : Uri.t option;  (** Permalink, when one could be determined. *)
  date : Syndic.Date.t option;  (** Publication date; [None] when unknown. *)
  feed : Feed.t;  (** The feed this post was extracted from. *)
  author : string;
  email : string;  (** Author email; empty string when not provided. *)
  content : Soup.soup Soup.node;  (** Parsed, sanitised HTML body. *)
  mutable link_response : (string, string) result option;
  (* Cache slot for the result of fetching [link]; [None] until set. *)
  tags : string list;
  summary : string option;
}
36
(** Generate a stable, unique ID from available data.

    Preference order: the explicit [guid] (when non-empty), then the
    permalink [link] (when non-empty), then an MD5 digest of
    [feed_url]|[title]|[date]. *)
let generate_id ?guid ?link ?title ?date ~feed_url () =
  match guid with
  | Some id when id <> "" ->
      (* Use explicit ID/GUID if available *)
      id
  | _ ->
      match link with
      | Some uri when Uri.to_string uri <> "" ->
          (* Use permalink as ID (stable and unique) *)
          Uri.to_string uri
      | _ ->
          (* Fallback: hash of feed_url + title + date *)
          let title_str = Option.value title ~default:"" in
          let date_str =
            match date with
            | Some d -> Ptime.to_rfc3339 d
            | None -> ""
          in
          let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in
          (* NOTE(review): [Digest.string] is MD5, not SHA256 as previously
             claimed here. That is fine for a stable, deterministic ID (no
             cryptographic strength needed). *)
          Digest.string composite |> Digest.to_hex
59
(* Rewrite attribute [attr] of element [el] so that its URI is resolved
   against [xmlbase] (i.e. relative links become absolute). *)
let resolve_links_attr ~xmlbase attr el =
  let original = Uri.of_string (Soup.R.attribute attr el) in
  let resolved = Syndic.XML.resolve ~xmlbase original in
  Soup.set_attribute attr (Uri.to_string resolved) el
66
(* Things that posts should not contain *)
(* Tag names whose elements are deleted outright from post HTML. *)
let undesired_tags = [ "style"; "script" ]
(* Attribute names stripped from every remaining element. *)
let undesired_attr = [ "id" ]
70
(* Parse [s] as HTML and sanitise it: resolve every <a href> and <img src>
   against [xmlbase], delete undesired tags, and strip undesired
   attributes from all remaining elements. *)
let html_of_text ?xmlbase s =
  let doc = Soup.parse s in
  let select selector = Soup.(doc $$ selector) in
  Soup.iter (resolve_links_attr ~xmlbase "href") (select "a[href]");
  Soup.iter (resolve_links_attr ~xmlbase "src") (select "img[src]");
  List.iter (fun tag -> Soup.iter Soup.delete (select tag)) undesired_tags;
  Soup.iter
    (fun el ->
      List.iter (fun attr -> Soup.delete_attribute attr el) undesired_attr)
    (select "*");
  doc
80
(* Do not trust sites using XML for HTML content. Convert to string and parse
   back. (Does not always fix bad HTML unfortunately.) *)
let html_of_syndic =
  let ns_prefix _ = Some "" in
  fun ?xmlbase h ->
    let pieces = List.map (Syndic.XML.to_string ~ns_prefix) h in
    html_of_text ?xmlbase (String.concat "" pieces)
88
(* [string_of_option o] is the contents of [o], or the empty string. *)
let string_of_option opt = Option.value opt ~default:""
90
(* Comparator for sorting posts: most recent first; posts with no date are
   always last.
   Fix: the [None, None] case previously returned 1, so comparing two
   undated posts was inconsistent ([compare p p <> 0] and both orderings
   claimed "greater"), violating the total-order contract [List.sort]
   expects. Two undated posts now compare equal. *)
let post_compare p1 p2 =
  match (p1.date, p2.date) with
  | Some d1, Some d2 -> Syndic.Date.compare d2 d1
  | None, Some _ -> 1
  | Some _, None -> -1
  | None, None -> 0
98
(* [remove n l] drops the first [n] elements of [l]; the whole of [l] when
   [n <= 0], and [] when [l] has fewer than [n] elements. *)
let rec remove n l =
  match l with
  | _ :: rest when n > 0 -> remove (n - 1) rest
  | _ -> l
101
(* [take n l] is the first [n] elements of [l] (all of [l] when shorter). *)
let rec take n = function
  | x :: rest when n > 0 -> x :: take (n - 1) rest
  | _ -> []
105
(** [post_of_atom ~feed e] converts a single Atom entry [e] into a post
    record attached to [feed]. *)
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
  Log.debug (fun m -> m "Processing Atom entry: %s"
    (Text_extract.string_of_text_construct e.title));

  (* Permalink: prefer the rel="alternate" link, then any link at all,
     then the entry id itself when it looks like an http(s) URL. *)
  let link =
    try
      Some
        (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
        .href
    with Not_found -> (
      Log.debug (fun m -> m "No alternate link found, trying fallback");
      match e.links with
      | l :: _ -> Some l.href
      | [] -> (
          match Uri.scheme e.id with
          | Some "http" -> Some e.id
          | Some "https" -> Some e.id
          | _ -> None))
  in
  (* Prefer the explicit publication date; fall back to last-updated. *)
  let date =
    match e.published with Some _ -> e.published | None -> Some e.updated
  in
  (* Body: full content when usable, otherwise the summary, otherwise an
     empty document. *)
  let content =
    match e.content with
    | Some (Text s) -> html_of_text s
    | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
    | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
    | Some (Mime _) | Some (Src _) | None -> (
        match e.summary with
        | Some (Text s) -> html_of_text s
        | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
        | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
        | None -> Soup.parse "")
  in
  let is_valid_author_name name =
    (* Filter out empty strings and placeholder values like "Unknown" *)
    let trimmed = String.trim name in
    trimmed <> "" && trimmed <> "Unknown"
  in
  let author_name =
    (* Fallback chain for author:
       1. Entry author (if present, not empty, and not "Unknown")
       2. Feed-level author (from Atom feed metadata)
       3. Feed title (from Atom feed metadata)
       4. Source name (manually entered feed name) *)
    try
      let author, _ = e.authors in
      let trimmed = String.trim author.name in
      if is_valid_author_name author.name then trimmed
      else raise Not_found (* Try feed-level author *)
    with Not_found -> (
      (* Feed content is now JSONFeed - try feed-level authors *)
      let jsonfeed_content = Feed.content feed in
      match Jsonfeed.authors jsonfeed_content.River_jsonfeed.feed with
      | Some (first :: _) ->
          let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
          if is_valid_author_name name then name
          else Feed.title feed
      | _ ->
          (* Use feed title as fallback *)
          Feed.title feed)
  in
  (* Extract tags from Atom categories *)
  let tags =
    List.map (fun cat -> cat.Syndic.Atom.term) e.categories
  in
  (* Extract summary - convert from text_construct to string *)
  let summary =
    match e.summary with
    | Some s -> Some (Text_extract.string_of_text_construct s)
    | None -> None
  in
  (* Generate unique ID *)
  let guid = Uri.to_string e.id in
  let title_str = Text_extract.string_of_text_construct e.title in
  let id =
    generate_id ~guid ?link ~title:title_str ?date
      ~feed_url:(Source.url (Feed.source feed)) ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = "";
    content;
    link_response = None;
    tags;
    summary;
  }
198
(** [post_of_rss2 ~feed it] converts a single RSS 2.0 item [it] into a post
    record attached to [feed]. *)
let post_of_rss2 ~(feed : Feed.t) it =
  (* An RSS2 story carries a title, a description, or both. When the
     <content> element is non-empty it wins over the description as body. *)
  let title, content =
    match it.Syndic.Rss2.story with
    | All (t, xmlbase, d) -> (
        ( t,
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
    | Title t ->
        let xmlbase, c = it.content in
        (t, html_of_text ?xmlbase c)
    | Description (xmlbase, d) -> (
        ( "",
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
  in
  (* Note: it.link is of type Uri.t option in Syndic *)
  let link =
    match (it.guid, it.link) with
    | Some u, _ when u.permalink -> Some u.data
    | _, Some _ -> it.link
    | Some u, _ ->
        (* Sometimes the guid is indicated with isPermaLink="false" but is
           nonetheless the only URL we get (e.g. ocamlpro). *)
        Some u.data
    | None, None -> None
  in
  (* Extract GUID string for ID generation *)
  let guid_str =
    match it.guid with
    | Some u -> Some (Uri.to_string u.data)
    | None -> None
  in
  (* RSS2 doesn't have a categories field exposed, use empty list *)
  let tags = [] in
  (* RSS2 doesn't have a separate summary field, so leave it empty *)
  let summary = None in
  (* Generate unique ID *)
  let id =
    generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
      ~feed_url:(Source.url (Feed.source feed)) ()
  in
  {
    id;
    title;
    link;
    feed;
    (* RSS2 <author> is an email address, not a display name, hence the
       split between [author] (source name) and [email] below. *)
    author = Source.name (Feed.source feed);
    email = string_of_option it.author;
    content;
    date = it.pubDate;
    link_response = None;
    tags;
    summary;
  }
255
(** [post_of_jsonfeed_item ~feed item] converts a single JSONFeed [item]
    into a post record attached to [feed]. *)
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
  Log.debug (fun m -> m "Processing JSONFeed item: %s"
    (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));

  (* Extract content - prefer HTML, fall back to text *)
  let content =
    match Jsonfeed.Item.content item with
    | `Html html -> html_of_text html
    | `Text text -> html_of_text text
    | `Both (html, _text) -> html_of_text html
  in

  (* Extract author - use first author if multiple *)
  let author_name, author_email =
    match Jsonfeed.Item.authors item with
    | Some (first :: _) ->
        let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
        (* JSONFeed authors don't typically have email *)
        (name, "")
    | _ ->
        (* Fall back to feed-level authors or feed title *)
        let jsonfeed_content = Feed.content feed in
        (match Jsonfeed.authors jsonfeed_content.River_jsonfeed.feed with
        | Some (first :: _) ->
            let name = Jsonfeed.Author.name first |> Option.value ~default:(Feed.title feed) in
            (name, "")
        | _ -> (Feed.title feed, ""))
  in

  (* Link - use url field *)
  let link =
    Jsonfeed.Item.url item
    |> Option.map Uri.of_string
  in

  (* Date *)
  let date = Jsonfeed.Item.date_published item in

  (* Summary *)
  let summary = Jsonfeed.Item.summary item in

  (* Tags *)
  let tags =
    Jsonfeed.Item.tags item
    |> Option.value ~default:[]
  in

  (* Generate unique ID - JSONFeed items always have an id field (required) *)
  let guid = Jsonfeed.Item.id item in
  let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in
  let id =
    generate_id ~guid ?link ~title:title_str ?date
      ~feed_url:(Source.url (Feed.source feed)) ()
  in

  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = author_email;
    content;
    link_response = None;
    tags;
    summary;
  }
324
(* [posts_of_feed c] extracts every item of [c]'s content as a post. *)
let posts_of_feed c =
  (* Feed content is now always JSONFeed *)
  let content = Feed.content c in
  let posts =
    Jsonfeed.items content.River_jsonfeed.feed
    |> List.map (post_of_jsonfeed_item ~feed:c)
  in
  Log.debug (fun m -> m "Extracted %d posts from feed '%s' (converted to JSONFeed)"
    (List.length posts) (Source.name (Feed.source c)));
  posts
333
(* [get_posts ?n ?ofs feeds] collects all posts from [feeds], sorts them
   newest-first, skips the first [ofs] posts, and keeps at most [n] of the
   rest (all of them when [n] is [None]). *)
let get_posts ?n ?(ofs = 0) planet_feeds =
  Log.info (fun m -> m "Processing %d feeds for posts" (List.length planet_feeds));

  let collected = planet_feeds |> List.map posts_of_feed |> List.concat in
  Log.debug (fun m -> m "Total posts collected: %d" (List.length collected));

  let sorted = List.sort post_compare collected in
  Log.debug (fun m -> m "Posts sorted by date (most recent first)");

  let after_offset = remove ofs sorted in
  match n with
  | None ->
      Log.debug (fun m -> m "Returning all %d posts (offset=%d)"
        (List.length after_offset) ofs);
      after_offset
  | Some limit ->
      let limited = take limit after_offset in
      Log.debug (fun m -> m "Returning %d posts (requested=%d, offset=%d)"
        (List.length limited) limit ofs);
      limited
357
(* [of_feeds feeds] is every post of [feeds], newest first, with no limit
   or offset. Deliberately eta-expanded so the optional arguments of
   [get_posts] are erased. *)
let of_feeds feeds = get_posts feeds
359
(* Field accessors for {!t}. *)
let feed t = t.feed
let title t = t.title
let link t = t.link
let date t = t.date
let author t = t.author
let email t = t.email
(* Serialise the parsed HTML body back to a string. *)
let content t = Soup.to_string t.content
let id t = t.id
let tags t = t.tags
let summary t = t.summary
370
(* Stub: always returns [None]. Fetching the linked page requires an HTTP
   environment that is not wired in yet. *)
let meta_description _t =
  (* TODO: This requires environment for HTTP access *)
  Log.debug (fun m -> m "meta_description not implemented (requires environment)");
  None
375
(* Stub: always returns [None]. Fetching the linked page requires an HTTP
   environment that is not wired in yet. *)
let seo_image _t =
  (* TODO: This requires environment for HTTP access *)
  Log.debug (fun m -> m "seo_image not implemented (requires environment)");
  None