1(*
2 * Copyright (c) 2014, OCaml.org project
3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 *)
17
(* Dedicated log source for this module; every log line below is emitted
   through [Log], so output can be filtered as "river.post". *)
let src = Logs.Src.create "river.post" ~doc:"River post processing"
module Log = (val Logs.src_log src : Logs.LOG)
20
(** A normalized blog post, built from an Atom entry, RSS2 item, or
    JSONFeed item by the [post_of_*] constructors below. *)
type t = {
  id : string;  (** Stable unique identifier; see [generate_id]. *)
  title : string;
  link : Uri.t option;  (** Permalink, when one could be determined. *)
  date : Syndic.Date.t option;  (** Publication date; [None] if unknown. *)
  feed : Feed.t;  (** The source feed this post came from. *)
  author : string;
  email : string;  (** Author email; empty for most feed kinds. *)
  content : Soup.soup Soup.node;  (** Sanitized HTML body (see [html_of_text]). *)
  mutable link_response : (string, string) result option;
  (* Cache for the fetched permalink response; see [fetch_link]. *)
  tags : string list;
  summary : string option;
}
34
(** Generate a stable, unique ID from available data.

    Preference order: a non-empty explicit [guid], then a non-empty
    permalink [link], then a hex digest of ["feed_url|title|date"].
    Note that [Digest] is MD5 — adequate as a stable de-duplication key,
    not a cryptographic identifier. *)
let generate_id ?guid ?link ?title ?date ~feed_url () =
  match guid with
  | Some id when id <> "" ->
  (* Use explicit ID/GUID if available *)
  id
  | _ ->
  match link with
  | Some uri when Uri.to_string uri <> "" ->
  (* Use permalink as ID (stable and unique) *)
  Uri.to_string uri
  | _ ->
  (* Fallback: hash of feed_url + title + date *)
  let title_str = Option.value title ~default:"" in
  let date_str =
  match date with
  | Some d -> Ptime.to_rfc3339 d
  | None -> ""
  in
  let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in
  (* [Digest] is MD5 (stable across runs and OCaml versions); the previous
     comment claiming SHA256 was wrong. *)
  Digest.string composite |> Digest.to_hex
57
(* [post_id post] is the post's stable unique identifier. *)
let post_id { id; _ } = id
59
(* Rewrite attribute [attr] of element [el] in place so that its URI is
   resolved against [xmlbase], turning relative links into absolute ones. *)
let resolve_links_attr ~xmlbase attr el =
  let raw = Soup.R.attribute attr el in
  let resolved = Syndic.XML.resolve ~xmlbase (Uri.of_string raw) in
  Soup.set_attribute attr (Uri.to_string resolved) el
66
(* Things that posts should not contain *)
(* Tags removed wholesale from post HTML. *)
let undesired_tags = [ "style"; "script" ]
(* Attributes stripped from every element (foreign ids could clash with
   the page the posts are embedded into). *)
let undesired_attr = [ "id" ]
70
(* Parse [s] as HTML and sanitize it: resolve relative link/image URLs
   against [xmlbase], delete undesired tags, and strip undesired
   attributes from every remaining element.  Returns the mutated soup. *)
let html_of_text ?xmlbase s =
  let soup = Soup.parse s in
  let ($$) = Soup.($$) in
  (* Make anchor and image URLs absolute. *)
  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
  (* Drop <style>/<script> elements entirely. *)
  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
  (* Strip blacklisted attributes from all elements. *)
  soup $$ "*" |> Soup.iter (fun el ->
    undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
  soup
80
(* Do not trust sites using XML for HTML content. Convert to string and parse
   back. (Does not always fix bad HTML unfortunately.)
   [ns_prefix] is hoisted above the [fun] so the closure is allocated once
   per program rather than once per call. *)
let html_of_syndic =
  let ns_prefix _ = Some "" in
  fun ?xmlbase h ->
    html_of_text ?xmlbase
      (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))
88
(* [string_of_option o] is the payload of [o], or [""] when [o] is [None]. *)
let string_of_option opt = Option.value opt ~default:""
90
(* Email on the forge contain the name in parenthesis, e.g.
   "user@host (Real Name)"; group 1 captures the text inside the parens.
   NOTE(review): not referenced anywhere in this file — presumably used by
   another module; confirm before removing. *)
let forge_name_re = Str.regexp ".*(\\([^()]*\\))"
93
(* Order posts most-recent-first; posts with no date always sort last.
   Fix: the [None, None] case previously returned [1], making the
   comparison inconsistent (two undated posts each "greater" than the
   other, and [post_compare p p <> 0]), which violates the total-order
   contract required by [List.sort].  Two undated posts compare equal. *)
let post_compare p1 p2 =
  (* Most recent posts first. Posts with no date are always last *)
  match (p1.date, p2.date) with
  | Some d1, Some d2 -> Syndic.Date.compare d2 d1
  | None, Some _ -> 1
  | Some _, None -> -1
  | None, None -> 0
101
(* [remove n l] drops the first [n] elements of [l]: all of [l] when
   [n >= List.length l], nothing when [n <= 0].  Tail recursive. *)
let rec remove n l =
  match l with
  | _ :: rest when n > 0 -> remove (n - 1) rest
  | _ -> l
104
(* [take n l] is the first [n] elements of [l] (the whole list when it is
   shorter, [] when [n <= 0]).  Not tail recursive, matching the original;
   [n] is expected to be small (a page size). *)
let rec take n = function
  | x :: rest when n > 0 -> x :: take (n - 1) rest
  | _ -> []
108
109(* Blog feed
110 ***********************************************************************)
111
(* Convert a Syndic Atom entry [e] belonging to [feed] into a [t].
   Handles link resolution, date fallback, the author fallback chain,
   tag/summary extraction, and stable-ID generation. *)
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
  Log.debug (fun m -> m "Processing Atom entry: %s"
    (Util.string_of_text_construct e.title));

  (* Link preference: the rel="alternate" link, then the first link,
     then the entry id itself when it looks like an http(s) URL. *)
  let link =
    try
      Some
        (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
        .href
    with Not_found -> (
      Log.debug (fun m -> m "No alternate link found, trying fallback");
      match e.links with
      | l :: _ -> Some l.href
      | [] -> (
          match Uri.scheme e.id with
          | Some "http" -> Some e.id
          | Some "https" -> Some e.id
          | _ -> None))
  in
  (* Prefer [published]; Atom guarantees [updated] is always present. *)
  let date =
    match e.published with Some _ -> e.published | None -> Some e.updated
  in
  (* Sanitize the entry content; fall back to the summary, then to an
     empty document.  Mime/Src content cannot be rendered inline. *)
  let content =
    match e.content with
    | Some (Text s) -> html_of_text s
    | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
    | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
    | Some (Mime _) | Some (Src _) | None -> (
        match e.summary with
        | Some (Text s) -> html_of_text s
        | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
        | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
        | None -> Soup.parse "")
  in
  let is_valid_author_name name =
    (* Filter out empty strings and placeholder values like "Unknown" *)
    let trimmed = String.trim name in
    trimmed <> "" && trimmed <> "Unknown"
  in
  let author_name =
    (* Fallback chain for author:
       1. Entry author (if present, not empty, and not "Unknown")
       2. Feed-level author (from Atom feed metadata)
       3. Feed title (from Atom feed metadata)
       4. Source name (manually entered feed name)
       [raise Not_found] deliberately re-enters the [with] handler to move
       down the chain. *)
    try
      let author, _ = e.authors in
      let trimmed = String.trim author.name in
      if is_valid_author_name author.name then trimmed
      else raise Not_found (* Try feed-level author *)
    with Not_found -> (
      match feed.content with
      | Feed.Atom atom_feed -> (
          (* Try feed-level authors *)
          match atom_feed.Syndic.Atom.authors with
          | author :: _ when is_valid_author_name author.name ->
              String.trim author.name
          | _ ->
              (* Use feed title *)
              Util.string_of_text_construct atom_feed.Syndic.Atom.title)
      | Feed.Rss2 _ | Feed.Json _ ->
          (* For RSS2 and JSONFeed, use the feed name which is the source name *)
          feed.name)
  in
  (* Extract tags from Atom categories *)
  let tags =
    List.map (fun cat -> cat.Syndic.Atom.term) e.categories
  in
  (* Extract summary - convert from text_construct to string *)
  let summary =
    match e.summary with
    | Some s -> Some (Util.string_of_text_construct s)
    | None -> None
  in
  (* Generate unique ID *)
  let guid = Uri.to_string e.id in
  let title_str = Util.string_of_text_construct e.title in
  let id =
    generate_id ~guid ?link ~title:title_str ?date ~feed_url:feed.url ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = "";
    content;
    link_response = None;
    tags;
    summary;
  }
205
(* Convert a Syndic RSS2 item [it] belonging to [feed] into a [t].
   RSS2 items may carry a title, a description, or both; the richer
   <content:encoded> payload (it.content) is preferred over the
   description whenever it is non-empty. *)
let post_of_rss2 ~(feed : Feed.t) it =
  let title, content =
    match it.Syndic.Rss2.story with
    | All (t, xmlbase, d) -> (
        ( t,
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
    | Title t ->
        let xmlbase, c = it.content in
        (t, html_of_text ?xmlbase c)
    | Description (xmlbase, d) -> (
        (* No title available: use the empty string. *)
        ( "",
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
  in
  (* Note: it.link is of type Uri.t option in Syndic *)
  let link =
    match (it.guid, it.link) with
    | Some u, _ when u.permalink -> Some u.data
    | _, Some _ -> it.link
    | Some u, _ ->
        (* Sometimes the guid is indicated with isPermaLink="false" but is
           nonetheless the only URL we get (e.g. ocamlpro). *)
        Some u.data
    | None, None -> None
  in
  (* Extract GUID string for ID generation *)
  let guid_str =
    match it.guid with
    | Some u -> Some (Uri.to_string u.data)
    | None -> None
  in
  (* RSS2 doesn't have a categories field exposed, use empty list *)
  let tags = [] in
  (* RSS2 doesn't have a separate summary field, so leave it empty *)
  let summary = None in
  (* Generate unique ID *)
  let id =
    generate_id ?guid:guid_str ?link ~title ?date:it.pubDate ~feed_url:feed.url ()
  in
  {
    id;
    title;
    link;
    feed;
    author = feed.name;
    email = string_of_option it.author;
    content;
    date = it.pubDate;
    link_response = None;
    tags;
    summary;
  }
261
(* Convert a JSONFeed item belonging to [feed] into a [t]. *)
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
  Log.debug (fun m -> m "Processing JSONFeed item: %s"
    (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));

  (* Extract content - prefer HTML, fall back to text *)
  let content =
    match Jsonfeed.Item.content item with
    | `Html html -> html_of_text html
    | `Text text -> html_of_text text
    | `Both (html, _text) -> html_of_text html
  in

  (* Extract author - use first author if multiple *)
  let author_name, author_email =
    match Jsonfeed.Item.authors item with
    | Some (first :: _) ->
        let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
        (* JSONFeed authors don't typically have email *)
        (name, "")
    | _ ->
        (* Fall back to feed-level authors or feed title *)
        (* NOTE(review): the Atom/RSS2 constructors fall back to
           [feed.name]; this one uses [feed.title] — confirm the
           difference is intentional. *)
        (match feed.content with
        | Feed.Json jsonfeed ->
            (match Jsonfeed.authors jsonfeed with
            | Some (first :: _) ->
                let name = Jsonfeed.Author.name first |> Option.value ~default:feed.title in
                (name, "")
            | _ -> (feed.title, ""))
        | _ -> (feed.title, ""))
  in

  (* Link - use url field *)
  let link =
    Jsonfeed.Item.url item
    |> Option.map Uri.of_string
  in

  (* Date *)
  let date = Jsonfeed.Item.date_published item in

  (* Summary *)
  let summary = Jsonfeed.Item.summary item in

  (* Tags *)
  let tags =
    Jsonfeed.Item.tags item
    |> Option.value ~default:[]
  in

  (* Generate unique ID - JSONFeed items always have an id field (required) *)
  let guid = Jsonfeed.Item.id item in
  let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in
  let id =
    generate_id ~guid ?link ~title:title_str ?date ~feed_url:feed.url ()
  in

  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = author_email;
    content;
    link_response = None;
    tags;
    summary;
  }
331
(* Expand feed [c] into its list of posts, dispatching on the feed kind
   and logging how many posts were extracted. *)
let posts_of_feed c =
  match c.Feed.content with
  | Feed.Atom f ->
      let posts = f.Syndic.Atom.entries |> List.map (post_of_atom ~feed:c) in
      Log.debug (fun m ->
          m "Extracted %d posts from Atom feed '%s'" (List.length posts)
            c.Feed.name);
      posts
  | Feed.Rss2 ch ->
      let posts = ch.Syndic.Rss2.items |> List.map (post_of_rss2 ~feed:c) in
      Log.debug (fun m ->
          m "Extracted %d posts from RSS2 feed '%s'" (List.length posts)
            c.Feed.name);
      posts
  | Feed.Json jsonfeed ->
      let posts =
        Jsonfeed.items jsonfeed |> List.map (post_of_jsonfeed_item ~feed:c)
      in
      Log.debug (fun m ->
          m "Extracted %d posts from JSONFeed '%s'" (List.length posts)
            c.Feed.name);
      posts
350
(* Convert a post back into a Syndic Atom entry, for re-publishing the
   aggregated planet feed. *)
let mk_entry post =
  let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
  (* Credit the source feed as a contributor. *)
  let contributors =
    [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
  in
  let links =
    match post.link with
    | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
    | None -> []
  in
  (* TODO: include source *)
  (* NOTE(review): this recomputes an MD5-of-title URI instead of reusing
     [post.id]; switching would change the ids of published entries, so it
     is left as-is pending a decision. *)
  let id =
    match post.link with
    | Some l -> l
    | None -> Uri.of_string (Digest.to_hex (Digest.string post.title))
  in
  let authors = (Syndic.Atom.author ~email:post.email post.author, []) in
  let title : Syndic.Atom.text_construct = Syndic.Atom.Text post.title in
  let updated =
    match post.date with
    (* Atom entry requires a date but RSS2 does not. So if a date
     * is not available, just capture the current date. *)
    | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
    | Some d -> d
  in
  Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
    ()
378
(* Convert every post into a Syndic Atom entry, preserving order. *)
let mk_entries posts = posts |> List.map mk_entry
380
(* Convert a post back into a JSONFeed item. *)
let mk_jsonfeed_item post =
  (* Serialize the sanitized HTML body back into a string payload. *)
  let content = `Html (Soup.to_string post.content) in
  (* Only emit an authors array when we actually have a name. *)
  let authors =
    match post.author with
    | "" -> None
    | name -> Some [ Jsonfeed.Author.create ~name () ]
  in
  Jsonfeed.Item.create ~id:post.id ~content
    ?url:(Option.map Uri.to_string post.link)
    ~title:post.title ?summary:post.summary ?date_published:post.date
    ?authors ~tags:post.tags ()
406
(* Convert every post into a JSONFeed item, preserving order. *)
let mk_jsonfeed_items posts = posts |> List.map mk_jsonfeed_item
408
(* [get_posts ?n ?ofs feeds] collects every post from [feeds], sorts them
   most-recent-first, skips the first [ofs] (default 0), and returns at
   most [n] of the rest (all of them when [n] is [None]). *)
let get_posts ?n ?(ofs = 0) planet_feeds =
  Log.info (fun m ->
      m "Processing %d feeds for posts" (List.length planet_feeds));
  let collected = List.concat_map posts_of_feed planet_feeds in
  Log.debug (fun m -> m "Total posts collected: %d" (List.length collected));
  let sorted = List.sort post_compare collected in
  Log.debug (fun m -> m "Posts sorted by date (most recent first)");
  (* Apply the offset, then the optional page-size limit. *)
  let paged = remove ofs sorted in
  match n with
  | None ->
      Log.debug (fun m ->
          m "Returning all %d posts (offset=%d)" (List.length paged) ofs);
      paged
  | Some limit ->
      let limited = take limit paged in
      Log.debug (fun m ->
          m "Returning %d posts (requested=%d, offset=%d)"
            (List.length limited) limit ofs);
      limited
432
433(* Fetch the link response and cache it. *)
434(* TODO: This requires environment for HTTP access
435let fetch_link env t =
436 match (t.link, t.link_response) with
437 | None, _ -> None
438 | Some _, Some (Ok x) -> Some x
439 | Some _, Some (Error _) -> None
440 | Some link, None -> (
441 try
442 let response = Http.get env (Uri.to_string link) in
443 t.link_response <- Some (Ok response);
444 Some response
445 with _exn ->
446 t.link_response <- Some (Error "");
447 None)
448*)
(* Stub: link fetching is disabled until HTTP access is wired in (see the
   commented-out implementation above); always returns [None]. *)
let fetch_link _ = None