(* NOTE(review): the two lines that were here ("My agentic slop goes here…",
   "at main 12 kB view raw") were web-scraper residue, not source content. *)
(*
 * Copyright (c) 2014, OCaml.org project
 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *)

(** Post representation and extraction from feeds. *)

let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"

module Log = (val Logs.src_log src : Logs.LOG)

(** A single aggregated post, normalised from Atom / RSS2 / JSONFeed input. *)
type t = {
  id : string;  (** Stable unique identifier (see [generate_id]). *)
  title : string;
  link : Uri.t option;  (** Permalink, when one could be determined. *)
  date : Syndic.Date.t option;
  feed : Feed.t;  (** The feed this post was extracted from. *)
  author : string;
  email : string;
  content : Soup.soup Soup.node;  (** Sanitised HTML body. *)
  mutable link_response : (string, string) result option;
      (* Cache slot for the result of fetching [link]; initialised to [None]
         by the constructors in this module. *)
  tags : string list;
  summary : string option;
}

(** [generate_id ?guid ?link ?title ?date ~feed_url ()] derives a stable,
    unique ID from the available data.  Preference order: explicit GUID,
    then permalink, then a digest of [feed_url] + [title] + [date]. *)
let generate_id ?guid ?link ?title ?date ~feed_url () =
  match guid with
  | Some id when id <> "" ->
      (* Use explicit ID/GUID if available *)
      id
  | _ -> (
      match link with
      | Some uri when Uri.to_string uri <> "" ->
          (* Use permalink as ID (stable and unique) *)
          Uri.to_string uri
      | _ ->
          (* Fallback: digest of feed_url + title + date *)
          let title_str = Option.value title ~default:"" in
          let date_str =
            match date with Some d -> Ptime.to_rfc3339 d | None -> ""
          in
          let composite =
            Printf.sprintf "%s|%s|%s" feed_url title_str date_str
          in
          (* [Digest.string] is MD5 (not SHA256 as a previous comment
             claimed).  It is used only as a stable de-duplication key,
             not for security, so MD5 is acceptable here. *)
          Digest.string composite |> Digest.to_hex)

(* Rewrite attribute [attr] of element [el] so that a relative URI is
   resolved against [xmlbase]. *)
let resolve_links_attr ~xmlbase attr el =
  Soup.R.attribute attr el
  |> Uri.of_string
  |> Syndic.XML.resolve ~xmlbase
  |> Uri.to_string
  |> fun value -> Soup.set_attribute attr value el

(* Things that posts should not contain *)
let undesired_tags = [ "style"; "script" ]
let undesired_attr = [ "id" ]

(* Parse [s] as HTML, resolve relative link/image URIs against [xmlbase],
   and strip undesired tags and attributes. *)
let html_of_text ?xmlbase s =
  let soup = Soup.parse s in
  let ( $$ ) = Soup.( $$ ) in
  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
  soup $$ "*"
  |> Soup.iter (fun el ->
         undesired_attr
         |> List.iter (fun attr -> Soup.delete_attribute attr el));
  soup

(* Do not trust sites using XML for HTML content. Convert to string and parse
   back. (Does not always fix bad HTML unfortunately.) *)
let html_of_syndic =
  let ns_prefix _ = Some "" in
  fun ?xmlbase h ->
    html_of_text ?xmlbase
      (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))

let string_of_option = function None -> "" | Some s -> s

(* Most recent posts first.  Posts with no date are always last.
   BUGFIX: two undated posts must compare equal (0), not 1 — a comparator
   that never returns 0 on equal keys violates the total-order contract
   required by [List.sort] and can produce inconsistent orderings. *)
let post_compare p1 p2 =
  match (p1.date, p2.date) with
  | Some d1, Some d2 -> Syndic.Date.compare d2 d1
  | None, Some _ -> 1
  | Some _, None -> -1
  | None, None -> 0

(* [remove n l] drops the first [n] elements of [l] (all of [l] if it is
   shorter than [n]). *)
let rec remove n l =
  if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl

(* [take n l] keeps at most the first [n] elements of [l]. *)
let rec take n = function
  | [] -> []
  | e :: tl -> if n > 0 then e :: take (n - 1) tl else []

(** Convert a Syndic Atom entry into a post. *)
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
  Log.debug (fun m ->
      m "Processing Atom entry: %s"
        (Text_extract.string_of_text_construct e.title));
  let link =
    try
      Some
        (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
          .href
    with Not_found -> (
      Log.debug (fun m -> m "No alternate link found, trying fallback");
      match e.links with
      | l :: _ -> Some l.href
      | [] -> (
          (* Some feeds use the entry ID as its URL. *)
          match Uri.scheme e.id with
          | Some "http" -> Some e.id
          | Some "https" -> Some e.id
          | _ -> None))
  in
  let date =
    match e.published with Some _ -> e.published | None -> Some e.updated
  in
  let content =
    match e.content with
    | Some (Text s) -> html_of_text s
    | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
    | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
    | Some (Mime _) | Some (Src _) | None -> (
        match e.summary with
        | Some (Text s) -> html_of_text s
        | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
        | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
        | None -> Soup.parse "")
  in
  let is_valid_author_name name =
    (* Filter out empty strings and placeholder values like "Unknown" *)
    let trimmed = String.trim name in
    trimmed <> "" && trimmed <> "Unknown"
  in
  let author_name =
    (* Fallback chain for author:
       1. Entry author (if present, not empty, and not "Unknown")
       2. Feed-level author (from feed metadata)
       3. Feed title (from feed metadata) *)
    try
      let author, _ = e.authors in
      let trimmed = String.trim author.name in
      if is_valid_author_name author.name then trimmed
      else raise Not_found (* Try feed-level author *)
    with Not_found -> (
      (* Feed content is now JSONFeed - try feed-level authors *)
      let jsonfeed_content = Feed.content feed in
      match Jsonfeed.authors jsonfeed_content.River_jsonfeed.feed with
      | Some (first :: _) ->
          let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
          if is_valid_author_name name then name else Feed.title feed
      | _ ->
          (* Use feed title as fallback *)
          Feed.title feed)
  in
  (* Extract tags from Atom categories *)
  let tags = List.map (fun cat -> cat.Syndic.Atom.term) e.categories in
  (* Extract summary - convert from text_construct to string *)
  let summary =
    match e.summary with
    | Some s -> Some (Text_extract.string_of_text_construct s)
    | None -> None
  in
  (* Generate unique ID *)
  let guid = Uri.to_string e.id in
  let title_str = Text_extract.string_of_text_construct e.title in
  let id =
    generate_id ~guid ?link ~title:title_str ?date
      ~feed_url:(Source.url (Feed.source feed)) ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = "";
    content;
    link_response = None;
    tags;
    summary;
  }

(** Convert a Syndic RSS2 item into a post. *)
let post_of_rss2 ~(feed : Feed.t) it =
  let title, content =
    match it.Syndic.Rss2.story with
    | All (t, xmlbase, d) -> (
        ( t,
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
    | Title t ->
        let xmlbase, c = it.content in
        (t, html_of_text ?xmlbase c)
    | Description (xmlbase, d) -> (
        ( "",
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
  in
  (* Note: it.link is of type Uri.t option in Syndic *)
  let link =
    match (it.guid, it.link) with
    | Some u, _ when u.permalink -> Some u.data
    | _, Some _ -> it.link
    | Some u, _ ->
        (* Sometimes the guid is indicated with isPermaLink="false" but is
           nonetheless the only URL we get (e.g. ocamlpro). *)
        Some u.data
    | None, None -> None
  in
  (* Extract GUID string for ID generation *)
  let guid_str =
    match it.guid with Some u -> Some (Uri.to_string u.data) | None -> None
  in
  (* RSS2 doesn't have a categories field exposed, use empty list *)
  let tags = [] in
  (* RSS2 doesn't have a separate summary field, so leave it empty *)
  let summary = None in
  (* Generate unique ID *)
  let id =
    generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
      ~feed_url:(Source.url (Feed.source feed)) ()
  in
  {
    id;
    title;
    link;
    feed;
    author = Source.name (Feed.source feed);
    email = string_of_option it.author;
    content;
    date = it.pubDate;
    link_response = None;
    tags;
    summary;
  }

(** Convert a JSONFeed item into a post. *)
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
  Log.debug (fun m ->
      m "Processing JSONFeed item: %s"
        (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));

  (* Extract content - prefer HTML, fall back to text *)
  let content =
    match Jsonfeed.Item.content item with
    | `Html html -> html_of_text html
    | `Text text -> html_of_text text
    | `Both (html, _text) -> html_of_text html
  in

  (* Extract author - use first author if multiple *)
  let author_name, author_email =
    match Jsonfeed.Item.authors item with
    | Some (first :: _) ->
        let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
        (* JSONFeed authors don't typically have email *)
        (name, "")
    | _ -> (
        (* Fall back to feed-level authors or feed title *)
        let jsonfeed_content = Feed.content feed in
        match Jsonfeed.authors jsonfeed_content.River_jsonfeed.feed with
        | Some (first :: _) ->
            let name =
              Jsonfeed.Author.name first
              |> Option.value ~default:(Feed.title feed)
            in
            (name, "")
        | _ -> (Feed.title feed, ""))
  in

  (* Link - use url field *)
  let link = Jsonfeed.Item.url item |> Option.map Uri.of_string in

  (* Date *)
  let date = Jsonfeed.Item.date_published item in

  (* Summary *)
  let summary = Jsonfeed.Item.summary item in

  (* Tags *)
  let tags = Jsonfeed.Item.tags item |> Option.value ~default:[] in

  (* Generate unique ID - JSONFeed items always have an id field (required) *)
  let guid = Jsonfeed.Item.id item in
  let title_str =
    Jsonfeed.Item.title item |> Option.value ~default:"Untitled"
  in
  let id =
    generate_id ~guid ?link ~title:title_str ?date
      ~feed_url:(Source.url (Feed.source feed)) ()
  in

  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = author_email;
    content;
    link_response = None;
    tags;
    summary;
  }

(** Extract every post from one feed. *)
let posts_of_feed c =
  (* Feed content is now always JSONFeed *)
  let jsonfeed_content = Feed.content c in
  let items = Jsonfeed.items jsonfeed_content.River_jsonfeed.feed in
  let posts = List.map (post_of_jsonfeed_item ~feed:c) items in
  Log.debug (fun m ->
      m "Extracted %d posts from feed '%s' (converted to JSONFeed)"
        (List.length posts)
        (Source.name (Feed.source c)));
  posts

(** [get_posts ?n ?ofs feeds] collects the posts of [feeds], sorts them
    most-recent-first, drops the first [ofs] (default 0), and keeps at most
    [n] of the rest (all of them when [n] is [None]). *)
let get_posts ?n ?(ofs = 0) planet_feeds =
  Log.info (fun m ->
      m "Processing %d feeds for posts" (List.length planet_feeds));

  let posts = List.concat @@ List.map posts_of_feed planet_feeds in
  Log.debug (fun m -> m "Total posts collected: %d" (List.length posts));

  let posts = List.sort post_compare posts in
  Log.debug (fun m -> m "Posts sorted by date (most recent first)");

  let posts = remove ofs posts in
  let result =
    match n with
    | None ->
        Log.debug (fun m ->
            m "Returning all %d posts (offset=%d)" (List.length posts) ofs);
        posts
    | Some n ->
        let limited = take n posts in
        Log.debug (fun m ->
            m "Returning %d posts (requested=%d, offset=%d)"
              (List.length limited) n ofs);
        limited
  in
  result

let of_feeds feeds = get_posts feeds

(* Field accessors. *)
let feed t = t.feed
let title t = t.title
let link t = t.link
let date t = t.date
let author t = t.author
let email t = t.email
let content t = Soup.to_string t.content
let id t = t.id
let tags t = t.tags
let summary t = t.summary

let meta_description _t =
  (* TODO: This requires environment for HTTP access *)
  Log.debug (fun m ->
      m "meta_description not implemented (requires environment)");
  None

let seo_image _t =
  (* TODO: This requires environment for HTTP access *)
  Log.debug (fun m -> m "seo_image not implemented (requires environment)");
  None