(* NOTE(review): the two lines that were here ("My agentic slop goes here…",
   "at main 12 kB view raw") were web-scraper residue, not source content. *)
(*
 * Copyright (c) 2014, OCaml.org project
 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *)

(** Post representation and extraction from feeds. *)

let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"

module Log = (val Logs.src_log src : Logs.LOG)

(** A single aggregated post, normalised from Atom / RSS2 / JSONFeed input. *)
type t = {
  id : string;  (** Stable unique identifier (see [generate_id]). *)
  title : string;
  link : Uri.t option;  (** Permalink, when one could be determined. *)
  date : Syndic.Date.t option;
  feed : Feed.t;  (** The feed this post was extracted from. *)
  author : string;
  email : string;
  content : Soup.soup Soup.node;  (** Sanitised HTML body. *)
  mutable link_response : (string, string) result option;
      (* Cache slot for the result of fetching [link]; initialised to [None]
         by the constructors in this module. *)
  tags : string list;
  summary : string option;
}

(** [generate_id ?guid ?link ?title ?date ~feed_url ()] derives a stable,
    unique ID from the available data.  Preference order: explicit GUID,
    then permalink, then a digest of [feed_url] + [title] + [date]. *)
let generate_id ?guid ?link ?title ?date ~feed_url () =
  match guid with
  | Some id when id <> "" ->
      (* Use explicit ID/GUID if available *)
      id
  | _ -> (
      match link with
      | Some uri when Uri.to_string uri <> "" ->
          (* Use permalink as ID (stable and unique) *)
          Uri.to_string uri
      | _ ->
          (* Fallback: digest of feed_url + title + date *)
          let title_str = Option.value title ~default:"" in
          let date_str =
            match date with Some d -> Ptime.to_rfc3339 d | None -> ""
          in
          let composite =
            Printf.sprintf "%s|%s|%s" feed_url title_str date_str
          in
          (* [Digest.string] is MD5 (not SHA256 as a previous comment
             claimed).  It is used only as a stable de-duplication key,
             not for security, so MD5 is acceptable here. *)
          Digest.string composite |> Digest.to_hex)

(* Rewrite attribute [attr] of element [el] so that a relative URI is
   resolved against [xmlbase]. *)
let resolve_links_attr ~xmlbase attr el =
  Soup.R.attribute attr el
  |> Uri.of_string
  |> Syndic.XML.resolve ~xmlbase
  |> Uri.to_string
  |> fun value -> Soup.set_attribute attr value el

(* Things that posts should not contain *)
let undesired_tags = [ "style"; "script" ]
let undesired_attr = [ "id" ]

(* Parse [s] as HTML, resolve relative link/image URIs against [xmlbase],
   and strip undesired tags and attributes. *)
let html_of_text ?xmlbase s =
  let soup = Soup.parse s in
  let ( $$ ) = Soup.( $$ ) in
  soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
  soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
  undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
  soup $$ "*"
  |> Soup.iter (fun el ->
         undesired_attr
         |> List.iter (fun attr -> Soup.delete_attribute attr el));
  soup

(* Do not trust sites using XML for HTML content. Convert to string and parse
   back. (Does not always fix bad HTML unfortunately.) *)
let html_of_syndic =
  let ns_prefix _ = Some "" in
  fun ?xmlbase h ->
    html_of_text ?xmlbase
      (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))

let string_of_option = function None -> "" | Some s -> s

(* Most recent posts first.  Posts with no date are always last.
   BUGFIX: two undated posts must compare equal (0), not 1 — a comparator
   that never returns 0 on equal keys violates the total-order contract
   required by [List.sort] and can produce inconsistent orderings. *)
let post_compare p1 p2 =
  match (p1.date, p2.date) with
  | Some d1, Some d2 -> Syndic.Date.compare d2 d1
  | None, Some _ -> 1
  | Some _, None -> -1
  | None, None -> 0

(* [remove n l] drops the first [n] elements of [l] (all of [l] if it is
   shorter than [n]). *)
let rec remove n l =
  if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl

(* [take n l] keeps at most the first [n] elements of [l]. *)
let rec take n = function
  | [] -> []
  | e :: tl -> if n > 0 then e :: take (n - 1) tl else []

(** Convert a Syndic Atom entry into a post. *)
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
  Log.debug (fun m ->
      m "Processing Atom entry: %s"
        (Text_extract.string_of_text_construct e.title));
  let link =
    try
      Some
        (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
          .href
    with Not_found -> (
      Log.debug (fun m -> m "No alternate link found, trying fallback");
      match e.links with
      | l :: _ -> Some l.href
      | [] -> (
          (* Some feeds use the entry ID as its URL. *)
          match Uri.scheme e.id with
          | Some "http" -> Some e.id
          | Some "https" -> Some e.id
          | _ -> None))
  in
  let date =
    match e.published with Some _ -> e.published | None -> Some e.updated
  in
  let content =
    match e.content with
    | Some (Text s) -> html_of_text s
    | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
    | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
    | Some (Mime _) | Some (Src _) | None -> (
        match e.summary with
        | Some (Text s) -> html_of_text s
        | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
        | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
        | None -> Soup.parse "")
  in
  let is_valid_author_name name =
    (* Filter out empty strings and placeholder values like "Unknown" *)
    let trimmed = String.trim name in
    trimmed <> "" && trimmed <> "Unknown"
  in
  let author_name =
    (* Fallback chain for author:
       1. Entry author (if present, not empty, and not "Unknown")
       2. Feed-level author (from feed metadata)
       3. Feed title (from feed metadata) *)
    try
      let author, _ = e.authors in
      let trimmed = String.trim author.name in
      if is_valid_author_name author.name then trimmed
      else raise Not_found (* Try feed-level author *)
    with Not_found -> (
      (* Feed content is now JSONFeed - try feed-level authors *)
      let jsonfeed_content = Feed.content feed in
      match Jsonfeed.authors jsonfeed_content.River_jsonfeed.feed with
      | Some (first :: _) ->
          let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
          if is_valid_author_name name then name else Feed.title feed
      | _ ->
          (* Use feed title as fallback *)
          Feed.title feed)
  in
  (* Extract tags from Atom categories *)
  let tags = List.map (fun cat -> cat.Syndic.Atom.term) e.categories in
  (* Extract summary - convert from text_construct to string *)
  let summary =
    match e.summary with
    | Some s -> Some (Text_extract.string_of_text_construct s)
    | None -> None
  in
  (* Generate unique ID *)
  let guid = Uri.to_string e.id in
  let title_str = Text_extract.string_of_text_construct e.title in
  let id =
    generate_id ~guid ?link ~title:title_str ?date
      ~feed_url:(Source.url (Feed.source feed)) ()
  in
  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = "";
    content;
    link_response = None;
    tags;
    summary;
  }

(** Convert a Syndic RSS2 item into a post. *)
let post_of_rss2 ~(feed : Feed.t) it =
  let title, content =
    match it.Syndic.Rss2.story with
    | All (t, xmlbase, d) -> (
        ( t,
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
    | Title t ->
        let xmlbase, c = it.content in
        (t, html_of_text ?xmlbase c)
    | Description (xmlbase, d) -> (
        ( "",
          match it.content with
          | _, "" -> html_of_text ?xmlbase d
          | xmlbase, c -> html_of_text ?xmlbase c ))
  in
  (* Note: it.link is of type Uri.t option in Syndic *)
  let link =
    match (it.guid, it.link) with
    | Some u, _ when u.permalink -> Some u.data
    | _, Some _ -> it.link
    | Some u, _ ->
        (* Sometimes the guid is indicated with isPermaLink="false" but is
           nonetheless the only URL we get (e.g. ocamlpro). *)
        Some u.data
    | None, None -> None
  in
  (* Extract GUID string for ID generation *)
  let guid_str =
    match it.guid with Some u -> Some (Uri.to_string u.data) | None -> None
  in
  (* RSS2 doesn't have a categories field exposed, use empty list *)
  let tags = [] in
  (* RSS2 doesn't have a separate summary field, so leave it empty *)
  let summary = None in
  (* Generate unique ID *)
  let id =
    generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
      ~feed_url:(Source.url (Feed.source feed)) ()
  in
  {
    id;
    title;
    link;
    feed;
    author = Source.name (Feed.source feed);
    email = string_of_option it.author;
    content;
    date = it.pubDate;
    link_response = None;
    tags;
    summary;
  }

(** Convert a JSONFeed item into a post. *)
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
  Log.debug (fun m ->
      m "Processing JSONFeed item: %s"
        (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));

  (* Extract content - prefer HTML, fall back to text *)
  let content =
    match Jsonfeed.Item.content item with
    | `Html html -> html_of_text html
    | `Text text -> html_of_text text
    | `Both (html, _text) -> html_of_text html
  in

  (* Extract author - use first author if multiple *)
  let author_name, author_email =
    match Jsonfeed.Item.authors item with
    | Some (first :: _) ->
        let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
        (* JSONFeed authors don't typically have email *)
        (name, "")
    | _ -> (
        (* Fall back to feed-level authors or feed title *)
        let jsonfeed_content = Feed.content feed in
        match Jsonfeed.authors jsonfeed_content.River_jsonfeed.feed with
        | Some (first :: _) ->
            let name =
              Jsonfeed.Author.name first
              |> Option.value ~default:(Feed.title feed)
            in
            (name, "")
        | _ -> (Feed.title feed, ""))
  in

  (* Link - use url field *)
  let link = Jsonfeed.Item.url item |> Option.map Uri.of_string in

  (* Date *)
  let date = Jsonfeed.Item.date_published item in

  (* Summary *)
  let summary = Jsonfeed.Item.summary item in

  (* Tags *)
  let tags = Jsonfeed.Item.tags item |> Option.value ~default:[] in

  (* Generate unique ID - JSONFeed items always have an id field (required) *)
  let guid = Jsonfeed.Item.id item in
  let title_str =
    Jsonfeed.Item.title item |> Option.value ~default:"Untitled"
  in
  let id =
    generate_id ~guid ?link ~title:title_str ?date
      ~feed_url:(Source.url (Feed.source feed)) ()
  in

  {
    id;
    title = title_str;
    link;
    date;
    feed;
    author = author_name;
    email = author_email;
    content;
    link_response = None;
    tags;
    summary;
  }

(** Extract every post from one feed. *)
let posts_of_feed c =
  (* Feed content is now always JSONFeed *)
  let jsonfeed_content = Feed.content c in
  let items = Jsonfeed.items jsonfeed_content.River_jsonfeed.feed in
  let posts = List.map (post_of_jsonfeed_item ~feed:c) items in
  Log.debug (fun m ->
      m "Extracted %d posts from feed '%s' (converted to JSONFeed)"
        (List.length posts)
        (Source.name (Feed.source c)));
  posts

(** [get_posts ?n ?ofs feeds] collects the posts of [feeds], sorts them
    most-recent-first, drops the first [ofs] (default 0), and keeps at most
    [n] of the rest (all of them when [n] is [None]). *)
let get_posts ?n ?(ofs = 0) planet_feeds =
  Log.info (fun m ->
      m "Processing %d feeds for posts" (List.length planet_feeds));

  let posts = List.concat @@ List.map posts_of_feed planet_feeds in
  Log.debug (fun m -> m "Total posts collected: %d" (List.length posts));

  let posts = List.sort post_compare posts in
  Log.debug (fun m -> m "Posts sorted by date (most recent first)");

  let posts = remove ofs posts in
  let result =
    match n with
    | None ->
        Log.debug (fun m ->
            m "Returning all %d posts (offset=%d)" (List.length posts) ofs);
        posts
    | Some n ->
        let limited = take n posts in
        Log.debug (fun m ->
            m "Returning %d posts (requested=%d, offset=%d)"
              (List.length limited) n ofs);
        limited
  in
  result

let of_feeds feeds = get_posts feeds

(* Field accessors. *)
let feed t = t.feed
let title t = t.title
let link t = t.link
let date t = t.date
let author t = t.author
let email t = t.email
let content t = Soup.to_string t.content
let id t = t.id
let tags t = t.tags
let summary t = t.summary

let meta_description _t =
  (* TODO: This requires environment for HTTP access *)
  Log.debug (fun m ->
      m "meta_description not implemented (requires environment)");
  None

let seo_image _t =
  (* TODO: This requires environment for HTTP access *)
  Log.debug (fun m -> m "seo_image not implemented (requires environment)");
  None