1(*
2 * Copyright (c) 2014, OCaml.org project
3 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 *)
17
(* Dedicated log source for this module; every log line below is emitted
   through [Log], so output can be filtered as "river.post". *)
let src = Logs.Src.create "river.post" ~doc:"River post processing"
module Log = (val Logs.src_log src : Logs.LOG)
20
(** A normalized blog post, built from an Atom entry, RSS2 item, or
    JSONFeed item by the [post_of_*] constructors below. *)
type t = {
  id : string;  (** Stable unique identifier; see [generate_id]. *)
  title : string;
  link : Uri.t option;  (** Permalink, when one could be determined. *)
  date : Syndic.Date.t option;  (** Publication date; [None] if unknown. *)
  feed : Feed.t;  (** The source feed this post came from. *)
  author : string;
  email : string;  (** Author email; empty for most feed kinds. *)
  content : Soup.soup Soup.node;  (** Sanitized HTML body (see [html_of_text]). *)
  mutable link_response : (string, string) result option;
  (* Cache for the fetched permalink response; see [fetch_link]. *)
  tags : string list;
  summary : string option;
}
34
(** Generate a stable, unique ID from available data.

    Preference order: a non-empty explicit [guid], then a non-empty
    permalink [link], then a hex digest of ["feed_url|title|date"].
    Note that [Digest] is MD5 — adequate as a stable de-duplication key,
    not a cryptographic identifier. *)
let generate_id ?guid ?link ?title ?date ~feed_url () =
  match guid with
  | Some id when id <> "" ->
  (* Use explicit ID/GUID if available *)
  id
  | _ ->
  match link with
  | Some uri when Uri.to_string uri <> "" ->
  (* Use permalink as ID (stable and unique) *)
  Uri.to_string uri
  | _ ->
  (* Fallback: hash of feed_url + title + date *)
  let title_str = Option.value title ~default:"" in
  let date_str =
  match date with
  | Some d -> Ptime.to_rfc3339 d
  | None -> ""
  in
  let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in
  (* [Digest] is MD5 (stable across runs and OCaml versions); the previous
     comment claiming SHA256 was wrong. *)
  Digest.string composite |> Digest.to_hex
57
(* [post_id post] is the post's stable unique identifier. *)
let post_id { id; _ } = id
59
(* Rewrite attribute [attr] of element [el] in place so that its URI is
   resolved against [xmlbase], turning relative links into absolute ones. *)
let resolve_links_attr ~xmlbase attr el =
  let raw = Soup.R.attribute attr el in
  let resolved = Syndic.XML.resolve ~xmlbase (Uri.of_string raw) in
  Soup.set_attribute attr (Uri.to_string resolved) el
66
(* Things that posts should not contain *)
(* Tags removed wholesale from post HTML. *)
let undesired_tags = [ "style"; "script" ]
(* Attributes stripped from every element (foreign ids could clash with
   the page the posts are embedded into). *)
let undesired_attr = [ "id" ]
70
(* Parse [s] as HTML and sanitize it: resolve relative link/image URLs
   against [xmlbase], delete undesired tags, and strip undesired
   attributes from every remaining element.  Returns the mutated soup. *)
let html_of_text ?xmlbase s =
  let soup = Soup.parse s in
  let ($$) = Soup.($$) in
  (* Make anchor and image URLs absolute. *)
  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
  (* Drop <style>/<script> elements entirely. *)
  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
  (* Strip blacklisted attributes from all elements. *)
  soup $$ "*" |> Soup.iter (fun el ->
    undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
  soup
80
(* Do not trust sites using XML for HTML content. Convert to string and parse
   back. (Does not always fix bad HTML unfortunately.)
   [ns_prefix] is hoisted above the [fun] so the closure is allocated once
   per program rather than once per call. *)
let html_of_syndic =
  let ns_prefix _ = Some "" in
  fun ?xmlbase h ->
    html_of_text ?xmlbase
      (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))
88
(* [string_of_option o] is the payload of [o], or [""] when [o] is [None]. *)
let string_of_option opt = Option.value opt ~default:""
90
(* Email on the forge contain the name in parenthesis, e.g.
   "user@host (Real Name)"; group 1 captures the text inside the parens.
   NOTE(review): not referenced anywhere in this file — presumably used by
   another module; confirm before removing. *)
let forge_name_re = Str.regexp ".*(\\([^()]*\\))"
93
(* Order posts most-recent-first; posts with no date always sort last.
   Fix: the [None, None] case previously returned [1], making the
   comparison inconsistent (two undated posts each "greater" than the
   other, and [post_compare p p <> 0]), which violates the total-order
   contract required by [List.sort].  Two undated posts compare equal. *)
let post_compare p1 p2 =
  (* Most recent posts first. Posts with no date are always last *)
  match (p1.date, p2.date) with
  | Some d1, Some d2 -> Syndic.Date.compare d2 d1
  | None, Some _ -> 1
  | Some _, None -> -1
  | None, None -> 0
101
(* [remove n l] drops the first [n] elements of [l]: all of [l] when
   [n >= List.length l], nothing when [n <= 0].  Tail recursive. *)
let rec remove n l =
  match l with
  | _ :: rest when n > 0 -> remove (n - 1) rest
  | _ -> l
104
(* [take n l] is the first [n] elements of [l] (the whole list when it is
   shorter, [] when [n <= 0]).  Not tail recursive, matching the original;
   [n] is expected to be small (a page size). *)
let rec take n = function
  | x :: rest when n > 0 -> x :: take (n - 1) rest
  | _ -> []
108
109(* Blog feed
110 ***********************************************************************)
111
(* Convert a Syndic Atom entry [e] belonging to [feed] into a [t].
   Handles link resolution, date fallback, the author fallback chain,
   tag/summary extraction, and stable-ID generation. *)
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
  Log.debug (fun m -> m "Processing Atom entry: %s"
    (Util.string_of_text_construct e.title));

  (* Link preference: the rel="alternate" link, then the first link,
     then the entry id itself when it looks like an http(s) URL. *)
  let link =
    try
      Some
        (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
        .href
    with Not_found -> (
      Log.debug (fun m -> m "No alternate link found, trying fallback");
      match e.links with
      | l :: _ -> Some l.href
      | [] -> (
          match Uri.scheme e.id with
          | Some "http" -> Some e.id
          | Some "https" -> Some e.id
          | _ -> None))
  in
  (* Prefer [published]; Atom guarantees [updated] is always present. *)
  let date =
    match e.published with Some _ -> e.published | None -> Some e.updated
  in
  (* Sanitize the entry content; fall back to the summary, then to an
     empty document.  Mime/Src content cannot be rendered inline. *)
  let content =
    match e.content with
    | Some (Text s) -> html_of_text s
    | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
    | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
    | Some (Mime _) | Some (Src _) | None -> (
        match e.summary with
        | Some (Text s) -> html_of_text s
        | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
        | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
        | None -> Soup.parse "")
  in
  let is_valid_author_name name =
    (* Filter out empty strings and placeholder values like "Unknown" *)
    let trimmed = String.trim name in
    trimmed <> "" && trimmed <> "Unknown"
  in
  let author_name =
    (* Fallback chain for author:
       1. Entry author (if present, not empty, and not "Unknown")
       2. Feed-level author (from Atom feed metadata)
       3. Feed title (from Atom feed metadata)
       4. Source name (manually entered feed name)
       [raise Not_found] deliberately re-enters the [with] handler to move
       down the chain. *)
    try
      let author, _ = e.authors in
      let trimmed = String.trim author.name in
      if is_valid_author_name author.name then trimmed
      else raise Not_found (* Try feed-level author *)
    with Not_found -> (
      match feed.content with
      | Feed.Atom atom_feed -> (
          (* Try feed-level authors *)
          match atom_feed.Syndic.Atom.authors with
          | author :: _ when is_valid_author_name author.name ->
              String.trim author.name
          | _ ->
              (* Use feed title *)
              Util.string_of_text_construct atom_feed.Syndic.Atom.title)
      | Feed.Rss2 _ | Feed.Json _ ->
          (* For RSS2 and JSONFeed, use the feed name which is the source name *)
          feed.name)
  in
  (* Extract tags from Atom categories *)
  let tags =
    List.map (fun cat -> cat.Syndic.Atom.term) e.categories
  in
  (* Extract summary - convert from text_construct to string *)
  let summary =
    match e.summary with
    | Some s -> Some (Util.string_of_text_construct s)
    | None -> None
  in
  (* Generate unique ID *)
  let guid = Uri.to_string e.id in
  let title_str = Util.string_of_text_construct e.title in
  let id =
    generate_id ~guid ?link ~title:title_str ?date ~feed_url:feed.url ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = "";
    content;
    link_response = None;
    tags;
    summary;
  }
205
(* Convert a Syndic RSS2 item [it] belonging to [feed] into a [t].
   RSS2 items may carry a title, a description, or both; the richer
   <content:encoded> payload (it.content) is preferred over the
   description whenever it is non-empty. *)
let post_of_rss2 ~(feed : Feed.t) it =
  let title, content =
    match it.Syndic.Rss2.story with
    | All (t, xmlbase, d) -> (
        ( t,
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
    | Title t ->
        let xmlbase, c = it.content in
        (t, html_of_text ?xmlbase c)
    | Description (xmlbase, d) -> (
        (* No title available: use the empty string. *)
        ( "",
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
  in
  (* Note: it.link is of type Uri.t option in Syndic *)
  let link =
    match (it.guid, it.link) with
    | Some u, _ when u.permalink -> Some u.data
    | _, Some _ -> it.link
    | Some u, _ ->
        (* Sometimes the guid is indicated with isPermaLink="false" but is
           nonetheless the only URL we get (e.g. ocamlpro). *)
        Some u.data
    | None, None -> None
  in
  (* Extract GUID string for ID generation *)
  let guid_str =
    match it.guid with
    | Some u -> Some (Uri.to_string u.data)
    | None -> None
  in
  (* RSS2 doesn't have a categories field exposed, use empty list *)
  let tags = [] in
  (* RSS2 doesn't have a separate summary field, so leave it empty *)
  let summary = None in
  (* Generate unique ID *)
  let id =
    generate_id ?guid:guid_str ?link ~title ?date:it.pubDate ~feed_url:feed.url ()
  in
  {
    id;
    title;
    link;
    feed;
    author = feed.name;
    email = string_of_option it.author;
    content;
    date = it.pubDate;
    link_response = None;
    tags;
    summary;
  }
261
(* Convert a JSONFeed item belonging to [feed] into a [t]. *)
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
  Log.debug (fun m -> m "Processing JSONFeed item: %s"
    (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));

  (* Extract content - prefer HTML, fall back to text *)
  let content =
    match Jsonfeed.Item.content item with
    | `Html html -> html_of_text html
    | `Text text -> html_of_text text
    | `Both (html, _text) -> html_of_text html
  in

  (* Extract author - use first author if multiple *)
  let author_name, author_email =
    match Jsonfeed.Item.authors item with
    | Some (first :: _) ->
        let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
        (* JSONFeed authors don't typically have email *)
        (name, "")
    | _ ->
        (* Fall back to feed-level authors or feed title *)
        (* NOTE(review): the Atom/RSS2 constructors fall back to
           [feed.name]; this one uses [feed.title] — confirm the
           difference is intentional. *)
        (match feed.content with
        | Feed.Json jsonfeed ->
            (match Jsonfeed.authors jsonfeed with
            | Some (first :: _) ->
                let name = Jsonfeed.Author.name first |> Option.value ~default:feed.title in
                (name, "")
            | _ -> (feed.title, ""))
        | _ -> (feed.title, ""))
  in

  (* Link - use url field *)
  let link =
    Jsonfeed.Item.url item
    |> Option.map Uri.of_string
  in

  (* Date *)
  let date = Jsonfeed.Item.date_published item in

  (* Summary *)
  let summary = Jsonfeed.Item.summary item in

  (* Tags *)
  let tags =
    Jsonfeed.Item.tags item
    |> Option.value ~default:[]
  in

  (* Generate unique ID - JSONFeed items always have an id field (required) *)
  let guid = Jsonfeed.Item.id item in
  let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in
  let id =
    generate_id ~guid ?link ~title:title_str ?date ~feed_url:feed.url ()
  in

  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = author_email;
    content;
    link_response = None;
    tags;
    summary;
  }
331
(* Expand feed [c] into its list of posts, dispatching on the feed kind
   and logging how many posts were extracted. *)
let posts_of_feed c =
  match c.Feed.content with
  | Feed.Atom f ->
      let posts = f.Syndic.Atom.entries |> List.map (post_of_atom ~feed:c) in
      Log.debug (fun m ->
          m "Extracted %d posts from Atom feed '%s'" (List.length posts)
            c.Feed.name);
      posts
  | Feed.Rss2 ch ->
      let posts = ch.Syndic.Rss2.items |> List.map (post_of_rss2 ~feed:c) in
      Log.debug (fun m ->
          m "Extracted %d posts from RSS2 feed '%s'" (List.length posts)
            c.Feed.name);
      posts
  | Feed.Json jsonfeed ->
      let posts =
        Jsonfeed.items jsonfeed |> List.map (post_of_jsonfeed_item ~feed:c)
      in
      Log.debug (fun m ->
          m "Extracted %d posts from JSONFeed '%s'" (List.length posts)
            c.Feed.name);
      posts
350
(* Convert a post back into a Syndic Atom entry, for re-publishing the
   aggregated planet feed. *)
let mk_entry post =
  let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
  (* Credit the source feed as a contributor. *)
  let contributors =
    [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
  in
  let links =
    match post.link with
    | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
    | None -> []
  in
  (* TODO: include source *)
  (* NOTE(review): this recomputes an MD5-of-title URI instead of reusing
     [post.id]; switching would change the ids of published entries, so it
     is left as-is pending a decision. *)
  let id =
    match post.link with
    | Some l -> l
    | None -> Uri.of_string (Digest.to_hex (Digest.string post.title))
  in
  let authors = (Syndic.Atom.author ~email:post.email post.author, []) in
  let title : Syndic.Atom.text_construct = Syndic.Atom.Text post.title in
  let updated =
    match post.date with
    (* Atom entry requires a date but RSS2 does not. So if a date
     * is not available, just capture the current date. *)
    | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
    | Some d -> d
  in
  Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
    ()
378
(* Convert every post into a Syndic Atom entry, preserving order. *)
let mk_entries posts = posts |> List.map mk_entry
380
(* Convert a post back into a JSONFeed item. *)
let mk_jsonfeed_item post =
  (* Serialize the sanitized HTML body back into a string payload. *)
  let content = `Html (Soup.to_string post.content) in
  (* Only emit an authors array when we actually have a name. *)
  let authors =
    match post.author with
    | "" -> None
    | name -> Some [ Jsonfeed.Author.create ~name () ]
  in
  Jsonfeed.Item.create ~id:post.id ~content
    ?url:(Option.map Uri.to_string post.link)
    ~title:post.title ?summary:post.summary ?date_published:post.date
    ?authors ~tags:post.tags ()
406
(* Convert every post into a JSONFeed item, preserving order. *)
let mk_jsonfeed_items posts = posts |> List.map mk_jsonfeed_item
408
(* [get_posts ?n ?ofs feeds] collects every post from [feeds], sorts them
   most-recent-first, skips the first [ofs] (default 0), and returns at
   most [n] of the rest (all of them when [n] is [None]). *)
let get_posts ?n ?(ofs = 0) planet_feeds =
  Log.info (fun m ->
      m "Processing %d feeds for posts" (List.length planet_feeds));
  let collected = List.concat_map posts_of_feed planet_feeds in
  Log.debug (fun m -> m "Total posts collected: %d" (List.length collected));
  let sorted = List.sort post_compare collected in
  Log.debug (fun m -> m "Posts sorted by date (most recent first)");
  (* Apply the offset, then the optional page-size limit. *)
  let paged = remove ofs sorted in
  match n with
  | None ->
      Log.debug (fun m ->
          m "Returning all %d posts (offset=%d)" (List.length paged) ofs);
      paged
  | Some limit ->
      let limited = take limit paged in
      Log.debug (fun m ->
          m "Returning %d posts (requested=%d, offset=%d)"
            (List.length limited) limit ofs);
      limited
432
433(* Fetch the link response and cache it. *)
434(* TODO: This requires environment for HTTP access
435let fetch_link env t =
436 match (t.link, t.link_response) with
437 | None, _ -> None
438 | Some _, Some (Ok x) -> Some x
439 | Some _, Some (Error _) -> None
440 | Some link, None -> (
441 try
442 let response = Http.get env (Uri.to_string link) in
443 t.link_response <- Some (Ok response);
444 Some response
445 with _exn ->
446 t.link_response <- Some (Error "");
447 None)
448*)
(* Stub: link fetching is disabled until HTTP access is wired in (see the
   commented-out implementation above); always returns [None]. *)
let fetch_link _ = None