(*
 * Copyright (c) 2014, OCaml.org project
 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *)

let src = Logs.Src.create "river.post" ~doc:"River post processing"

module Log = (val Logs.src_log src : Logs.LOG)

(** A normalized post extracted from an Atom, RSS2 or JSONFeed entry. *)
type t = {
  id : string;  (** Stable unique identifier, see {!generate_id}. *)
  title : string;
  link : Uri.t option;
  date : Syndic.Date.t option;
  feed : Feed.t;
  author : string;
  email : string;
  content : Soup.soup Soup.node;
  mutable link_response : (string, string) result option;
      (** Cached result of fetching [link]; [None] until attempted. *)
  tags : string list;
  summary : string option;
}

(** [generate_id ?guid ?link ?title ?date ~feed_url ()] generates a stable,
    unique ID from the available data, in order of preference:
    1. an explicit, non-empty GUID;
    2. a non-empty permalink;
    3. a digest of [feed_url], [title] and [date]. *)
let generate_id ?guid ?link ?title ?date ~feed_url () =
  match guid with
  | Some id when id <> "" ->
      (* Use explicit ID/GUID if available *)
      id
  | _ -> (
      match link with
      | Some uri when Uri.to_string uri <> "" ->
          (* Use permalink as ID (stable and unique) *)
          Uri.to_string uri
      | _ ->
          (* Fallback: hash of feed_url + title + date *)
          let title_str = Option.value title ~default:"" in
          let date_str =
            match date with Some d -> Ptime.to_rfc3339 d | None -> ""
          in
          let composite =
            Printf.sprintf "%s|%s|%s" feed_url title_str date_str
          in
          (* [Digest] is MD5, not SHA256.  That is fine here: we only need a
             stable, well-distributed identifier, not cryptographic
             strength.  Do not change the digest function lightly — it would
             change every generated post ID. *)
          Digest.string composite |> Digest.to_hex)

(** [post_id post] is the stable identifier of [post]. *)
let post_id post = post.id

(* Rebase the URI held in attribute [attr] of element [el] against [xmlbase],
   updating the element in place. *)
let resolve_links_attr ~xmlbase attr el =
  Soup.R.attribute attr el
  |> Uri.of_string
  |> Syndic.XML.resolve ~xmlbase
  |> Uri.to_string
  |> fun value -> Soup.set_attribute attr value el

(* Things that posts should not contain *)
let undesired_tags = [ "style"; "script" ]
let undesired_attr = [ "id" ]

(* Parse [s] as HTML, resolve relative [href]/[src] links against [xmlbase],
   and strip undesired tags and attributes. *)
let html_of_text ?xmlbase s =
  let soup = Soup.parse s in
  let ( $$ ) = Soup.( $$ ) in
  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
  soup $$ "*"
  |> Soup.iter (fun el ->
         undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
  soup

(* Do not trust sites using XML for HTML content. Convert to string and parse
   back. (Does not always fix bad HTML unfortunately.) *)
let html_of_syndic =
  let ns_prefix _ = Some "" in
  fun ?xmlbase h ->
    html_of_text ?xmlbase
      (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))

let string_of_option = function None -> "" | Some s -> s

(* Email on the forge contain the name in parenthesis *)
let forge_name_re = Str.regexp ".*(\\([^()]*\\))"

(* Most recent posts first.  Posts with no date are always last.
   NOTE: two undated posts must compare equal (0) — returning a nonzero
   value for [compare x x] breaks the total-order contract [List.sort]
   relies on and can produce inconsistent orderings. *)
let post_compare p1 p2 =
  match (p1.date, p2.date) with
  | Some d1, Some d2 -> Syndic.Date.compare d2 d1
  | None, Some _ -> 1
  | Some _, None -> -1
  | None, None -> 0

(* [remove n l] drops the first [n] elements of [l] (all of them if [l] is
   shorter than [n]). *)
let rec remove n l =
  if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl

(* [take n l] keeps at most the first [n] elements of [l]. *)
let rec take n = function
  | [] -> []
  | e :: tl -> if n > 0 then e :: take (n - 1) tl else []

(* Blog feed
 ***********************************************************************)

(* Convert a Syndic Atom entry into a {!t}. *)
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
  Log.debug (fun m ->
      m "Processing Atom entry: %s" (Util.string_of_text_construct e.title));
  let link =
    try
      Some
        (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
          .href
    with Not_found -> (
      Log.debug (fun m -> m "No alternate link found, trying fallback");
      match e.links with
      | l :: _ -> Some l.href
      | [] -> (
          (* Last resort: some feeds use an http(s) URL as the entry id. *)
          match Uri.scheme e.id with
          | Some "http" -> Some e.id
          | Some "https" -> Some e.id
          | _ -> None))
  in
  let date =
    match e.published with Some _ -> e.published | None -> Some e.updated
  in
  let content =
    match e.content with
    | Some (Text s) -> html_of_text s
    | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
    | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
    | Some (Mime _) | Some (Src _) | None -> (
        (* No usable content: fall back to the summary, if any. *)
        match e.summary with
        | Some (Text s) -> html_of_text s
        | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
        | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
        | None -> Soup.parse "")
  in
  let is_valid_author_name name =
    (* Filter out empty strings and placeholder values like "Unknown" *)
    let trimmed = String.trim name in
    trimmed <> "" && trimmed <> "Unknown"
  in
  let author_name =
    (* Fallback chain for author:
       1. Entry author (if present, not empty, and not "Unknown")
       2. Feed-level author (from Atom feed metadata)
       3. Feed title (from Atom feed metadata)
       4. Source name (manually entered feed name) *)
    try
      let author, _ = e.authors in
      let trimmed = String.trim author.name in
      if is_valid_author_name author.name then trimmed
      else raise Not_found (* Try feed-level author *)
    with Not_found -> (
      match feed.content with
      | Feed.Atom atom_feed -> (
          (* Try feed-level authors *)
          match atom_feed.Syndic.Atom.authors with
          | author :: _ when is_valid_author_name author.name ->
              String.trim author.name
          | _ ->
              (* Use feed title *)
              Util.string_of_text_construct atom_feed.Syndic.Atom.title)
      | Feed.Rss2 _ | Feed.Json _ ->
          (* For RSS2 and JSONFeed, use the feed name which is the source
             name *)
          feed.name)
  in
  (* Extract tags from Atom categories *)
  let tags = List.map (fun cat -> cat.Syndic.Atom.term) e.categories in
  (* Extract summary - convert from text_construct to string *)
  let summary =
    match e.summary with
    | Some s -> Some (Util.string_of_text_construct s)
    | None -> None
  in
  (* Generate unique ID *)
  let guid = Uri.to_string e.id in
  let title_str = Util.string_of_text_construct e.title in
  let id =
    generate_id ~guid ?link ~title:title_str ?date ~feed_url:feed.url ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = "";
    content;
    link_response = None;
    tags;
    summary;
  }

(* Convert a Syndic RSS2 item into a {!t}. *)
let post_of_rss2 ~(feed : Feed.t) it =
  let title, content =
    match it.Syndic.Rss2.story with
    | All (t, xmlbase, d) -> (
        ( t,
          (* Prefer <content:encoded> when non-empty, else the description. *)
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
    | Title t ->
        let xmlbase, c = it.content in
        (t, html_of_text ?xmlbase c)
    | Description (xmlbase, d) -> (
        ( "",
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
  in
  (* Note: it.link is of type Uri.t option in Syndic *)
  let link =
    match (it.guid, it.link) with
    | Some u, _ when u.permalink -> Some u.data
    | _, Some _ -> it.link
    | Some u, _ ->
        (* Sometimes the guid is indicated with isPermaLink="false" but is
           nonetheless the only URL we get (e.g. ocamlpro). *)
        Some u.data
    | None, None -> None
  in
  (* Extract GUID string for ID generation *)
  let guid_str =
    match it.guid with
    | Some u -> Some (Uri.to_string u.data)
    | None -> None
  in
  (* RSS2 doesn't have a categories field exposed, use empty list *)
  let tags = [] in
  (* RSS2 doesn't have a separate summary field, so leave it empty *)
  let summary = None in
  (* Generate unique ID *)
  let id =
    generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
      ~feed_url:feed.url ()
  in
  {
    id;
    title;
    link;
    feed;
    author = feed.name;
    email = string_of_option it.author;
    content;
    date = it.pubDate;
    link_response = None;
    tags;
    summary;
  }

(* Convert a JSONFeed item into a {!t}. *)
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
  Log.debug (fun m ->
      m "Processing JSONFeed item: %s"
        (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));
  (* Extract content - prefer HTML, fall back to text *)
  let content =
    match Jsonfeed.Item.content item with
    | `Html html -> html_of_text html
    | `Text text -> html_of_text text
    | `Both (html, _text) -> html_of_text html
  in
  (* Extract author - use first author if multiple *)
  let author_name, author_email =
    match Jsonfeed.Item.authors item with
    | Some (first :: _) ->
        let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
        (* JSONFeed authors don't typically have email *)
        (name, "")
    | _ -> (
        (* Fall back to feed-level authors or feed title *)
        match feed.content with
        | Feed.Json jsonfeed -> (
            match Jsonfeed.authors jsonfeed with
            | Some (first :: _) ->
                let name =
                  Jsonfeed.Author.name first |> Option.value ~default:feed.title
                in
                (name, "")
            | _ -> (feed.title, ""))
        | _ -> (feed.title, ""))
  in
  (* Link - use url field *)
  let link = Jsonfeed.Item.url item |> Option.map Uri.of_string in
  (* Date *)
  let date = Jsonfeed.Item.date_published item in
  (* Summary *)
  let summary = Jsonfeed.Item.summary item in
  (* Tags *)
  let tags = Jsonfeed.Item.tags item |> Option.value ~default:[] in
  (* Generate unique ID - JSONFeed items always have an id field (required) *)
  let guid = Jsonfeed.Item.id item in
  let title_str =
    Jsonfeed.Item.title item |> Option.value ~default:"Untitled"
  in
  let id =
    generate_id ~guid ?link ~title:title_str ?date ~feed_url:feed.url ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = author_email;
    content;
    link_response = None;
    tags;
    summary;
  }

(* Extract all posts from a feed, dispatching on the feed's format. *)
let posts_of_feed c =
  match c.Feed.content with
  | Feed.Atom f ->
      let posts = List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries in
      Log.debug (fun m ->
          m "Extracted %d posts from Atom feed '%s'" (List.length posts)
            c.Feed.name);
      posts
  | Feed.Rss2 ch ->
      let posts = List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items in
      Log.debug (fun m ->
          m "Extracted %d posts from RSS2 feed '%s'" (List.length posts)
            c.Feed.name);
      posts
  | Feed.Json jsonfeed ->
      let items = Jsonfeed.items jsonfeed in
      let posts = List.map (post_of_jsonfeed_item ~feed:c) items in
      Log.debug (fun m ->
          m "Extracted %d posts from JSONFeed '%s'" (List.length posts)
            c.Feed.name);
      posts

(* Build a Syndic Atom entry from a post (for the aggregated output feed). *)
let mk_entry post =
  let content = Syndic.Atom.Html (None, Soup.to_string post.content) in
  let contributors =
    [ Syndic.Atom.author ~uri:(Uri.of_string post.feed.url) post.feed.name ]
  in
  let links =
    match post.link with
    | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
    | None -> []
  in
  (* TODO: include source *)
  let id =
    match post.link with
    | Some l -> l
    | None -> Uri.of_string (Digest.to_hex (Digest.string post.title))
  in
  let authors = (Syndic.Atom.author ~email:post.email post.author, []) in
  let title : Syndic.Atom.text_construct = Syndic.Atom.Text post.title in
  let updated =
    match post.date with
    (* Atom entry requires a date but RSS2 does not. So if a date
     * is not available, just capture the current date. *)
    | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
    | Some d -> d
  in
  Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
    ()

let mk_entries posts = List.map mk_entry posts

(* Build a JSONFeed item from a post (for the aggregated output feed). *)
let mk_jsonfeed_item post =
  (* Convert HTML content back to string *)
  let html = Soup.to_string post.content in
  let content = `Html html in
  (* Create author *)
  let authors =
    if post.author <> "" then
      let author = Jsonfeed.Author.create ~name:post.author () in
      Some [ author ]
    else None
  in
  (* Create item *)
  Jsonfeed.Item.create ~id:post.id ~content
    ?url:(Option.map Uri.to_string post.link)
    ~title:post.title ?summary:post.summary ?date_published:post.date ?authors
    ~tags:post.tags ()

let mk_jsonfeed_items posts = List.map mk_jsonfeed_item posts

(** [get_posts ?n ?ofs feeds] collects the posts of [feeds], sorted most
    recent first, skipping the first [ofs] posts and returning at most [n]
    (all remaining posts when [n] is omitted). *)
let get_posts ?n ?(ofs = 0) planet_feeds =
  Log.info (fun m ->
      m "Processing %d feeds for posts" (List.length planet_feeds));
  let posts = List.concat @@ List.map posts_of_feed planet_feeds in
  Log.debug (fun m -> m "Total posts collected: %d" (List.length posts));
  let posts = List.sort post_compare posts in
  Log.debug (fun m -> m "Posts sorted by date (most recent first)");
  let posts = remove ofs posts in
  let result =
    match n with
    | None ->
        Log.debug (fun m ->
            m "Returning all %d posts (offset=%d)" (List.length posts) ofs);
        posts
    | Some n ->
        let limited = take n posts in
        Log.debug (fun m ->
            m "Returning %d posts (requested=%d, offset=%d)"
              (List.length limited) n ofs);
        limited
  in
  result

(* Fetch the link response and cache it. *)
(* TODO: This requires environment for HTTP access
let fetch_link env t =
  match (t.link, t.link_response) with
  | None, _ -> None
  | Some _, Some (Ok x) -> Some x
  | Some _, Some (Error _) -> None
  | Some link, None -> (
    try
      let response = Http.get env (Uri.to_string link) in
      t.link_response <- Some (Ok response);
      Some response
    with _exn ->
      t.link_response <- Some (Error "");
      None)
*)
let fetch_link _ = None