My agentic slop goes here. Not intended for anyone else!
at main 21 kB view raw
(*
 * Copyright (c) 2014, OCaml.org project
 * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *)

(** River RSS/Atom/JSONFeed aggregator library *)

(** {1 Feed Sources} *)

module Source : sig
  type t
  (** A feed source with name and URL. *)

  val make : name:string -> url:string -> t
  (** [make ~name ~url] creates a new feed source. *)

  val name : t -> string
  (** [name source] returns the feed name/label. *)

  val url : t -> string
  (** [url source] returns the feed URL. *)

  val jsont : t Jsont.t
  (** JSON codec for sources. *)
end

(** {1 HTTP Session Management} *)

module Session : sig
  type t
  (** An abstract HTTP session for fetching feeds.

      The session manages HTTP connections and is tied to an Eio switch
      for proper resource cleanup. *)

  val init :
    sw:Eio.Switch.t ->
    < clock : float Eio.Time.clock_ty Eio.Resource.t;
      fs : Eio.Fs.dir_ty Eio.Path.t;
      net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
    t
  (** [init ~sw env] creates a new HTTP session.

      The session is configured with appropriate defaults for fetching feeds:
      - User-Agent: "OCaml-River/1.0"
      - Automatic redirect following (max 5 redirects)
      - TLS verification enabled

      @param sw The switch for resource management
      @param env The Eio environment *)

  val with_session :
    < clock : float Eio.Time.clock_ty Eio.Resource.t;
      fs : Eio.Fs.dir_ty Eio.Path.t;
      net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
    (t -> 'a) -> 'a
  (** [with_session env f] creates a session and automatically manages its
      lifecycle.

      This is the recommended way to use River as it ensures proper cleanup.

      @param env The Eio environment
      @param f The function to run with the session *)
end

(** {1 Feeds and Posts} *)

module Feed : sig
  type t
  (** An Atom, RSS2, or JSON Feed. *)

  val fetch : Session.t -> Source.t -> t
  (** [fetch session source] fetches and parses a feed from the given source.

      @param session The HTTP session
      @param source The feed source to fetch
      @raise Failure if the feed cannot be fetched or parsed *)

  val source : t -> Source.t
  (** [source feed] returns the source this feed was fetched from. *)
end

module Post : sig
  type t
  (** A post from a feed. *)

  val of_feeds : Feed.t list -> t list
  (** [of_feeds feeds] extracts and deduplicates posts from the given feeds.

      Posts are deduplicated by ID. *)

  val feed : t -> Feed.t
  (** [feed post] returns the feed this post originated from. *)

  val title : t -> string
  (** [title post] returns the post title. *)

  val link : t -> Uri.t option
  (** [link post] returns the post link. *)

  val date : t -> Syndic.Date.t option
  (** [date post] returns the post date. *)

  val author : t -> string
  (** [author post] returns the post author name. *)

  val email : t -> string
  (** [email post] returns the post author email. *)

  val content : t -> string
  (** [content post] returns the post content. *)

  val id : t -> string
  (** [id post] returns the unique identifier of the post. *)

  val tags : t -> string list
  (** [tags post] returns the list of tags associated with the post. *)

  val summary : t -> string option
  (** [summary post] returns the summary/excerpt of the post, if available. *)

  val meta_description : t -> string option
  (** [meta_description post] returns the meta description from the origin
      site.

      To get the meta description, we fetch the content of [link post] and look
      for an HTML meta tag with name "description" or "og:description". *)

  val seo_image : t -> string option
  (** [seo_image post] returns the social media image URL.

      To get the SEO image, we fetch the content of [link post] and look for an
      HTML meta tag with name "og:image" or "twitter:image". *)
end

(** {1 Format Conversion and Export} *)

module Format : sig
  (** Feed format conversion and export. *)

  module Atom : sig
    (** Atom 1.0 format support. *)

    val entry_of_post : Post.t -> Syndic.Atom.entry
    (** [entry_of_post post] converts a post to an Atom entry. *)

    val entries_of_posts : Post.t list -> Syndic.Atom.entry list
    (** [entries_of_posts posts] converts posts to Atom entries. *)

    val feed_of_entries :
      title:string ->
      ?id:string ->
      ?authors:(string * string option) list ->
      Syndic.Atom.entry list ->
      Syndic.Atom.feed
    (** [feed_of_entries ~title entries] creates an Atom feed from entries.

        @param title The feed title
        @param id Optional feed ID (default: "urn:river:merged")
        @param authors Optional list of (name, email) tuples *)

    val to_string : Syndic.Atom.feed -> string
    (** [to_string feed] serializes an Atom feed to XML string. *)
  end

  module Rss2 : sig
    (** RSS 2.0 format support. *)

    val of_feed : Feed.t -> Syndic.Rss2.channel option
    (** [of_feed feed] extracts RSS2 channel if the feed is RSS2 format.

        Returns None if the feed is not RSS2. *)
  end

  module Jsonfeed : sig
    (** JSON Feed 1.1 format support. *)

    val item_of_post : Post.t -> Jsonfeed.Item.t
    (** [item_of_post post] converts a post to a JSONFeed item. *)

    val items_of_posts : Post.t list -> Jsonfeed.Item.t list
    (** [items_of_posts posts] converts posts to JSONFeed items. *)

    val feed_of_items :
      title:string ->
      ?home_page_url:string ->
      ?feed_url:string ->
      ?description:string ->
      ?icon:string ->
      ?favicon:string ->
      Jsonfeed.Item.t list ->
      Jsonfeed.t
    (** [feed_of_items ~title items] creates a JSONFeed from items.

        @param title The feed title (required)
        @param home_page_url The URL of the website the feed represents
        @param feed_url The URL of the feed itself
        @param description A description of the feed
        @param icon URL of an icon for the feed (512x512 recommended)
        @param favicon URL of a favicon for the feed (64x64 recommended) *)

    val feed_of_posts :
      title:string ->
      ?home_page_url:string ->
      ?feed_url:string ->
      ?description:string ->
      ?icon:string ->
      ?favicon:string ->
      Post.t list ->
      Jsonfeed.t
    (** [feed_of_posts ~title posts] creates a JSONFeed from posts.

        Convenience function that combines [items_of_posts] and
        [feed_of_items]. *)

    val to_string : ?minify:bool -> Jsonfeed.t -> (string, string) result
    (** [to_string ?minify feed] serializes a JSONFeed to JSON string.

        @param minify If true, output compact JSON; if false, pretty-print
        (default: false) *)

    val of_feed : Feed.t -> Jsonfeed.t option
    (** [of_feed feed] extracts JSONFeed if the feed is JSONFeed format.

        Returns None if the feed is not JSONFeed. *)
  end
end

(** {1 JSONFeed with Atom Extensions} *)

module River_jsonfeed : sig
  (** JSONFeed with Atom extension support for River.

      This module provides conversion between Atom feeds and JSONFeed format,
      with custom extensions to preserve Atom-specific metadata that doesn't
      have direct JSONFeed equivalents.

      The extensions follow the JSONFeed specification for custom fields:
      - Prefixed with underscore + letter: [_atom]
      - Contains [about] field with documentation URL
      - Feed readers can safely ignore unknown extensions

      See: https://www.jsonfeed.org/mappingrssandatom/ *)

  (** {2 Extension Types} *)

  type category = {
    term : string;  (** Category term (required in Atom) *)
    scheme : string option;  (** Category scheme/domain *)
    label : string option;  (** Human-readable label *)
  }
  (** A category attached to a feed or an item. *)

  type contributor = {
    contributor_name : string;  (** Contributor name *)
    contributor_uri : string option;  (** Contributor URI, if any *)
    contributor_email : string option;  (** Contributor email, if any *)
  }
  (** A contributor attached to a feed or an item. *)

  type generator = {
    generator_name : string;  (** Generator name *)
    generator_uri : string option;  (** Generator URI *)
    generator_version : string option;  (** Generator version *)
  }
  (** The generator recorded for a feed. *)

  type source = {
    source_id : string;  (** Source feed ID *)
    source_title : string;  (** Source feed title *)
    source_updated : Ptime.t;  (** Source feed update time *)
  }
  (** Source-feed metadata recorded for an item. *)

  type content_type =
    | Text  (** Plain text *)
    | Html  (** HTML content *)
    | Xhtml  (** XHTML content *)
  (** The kind of content carried by an item. *)

  type feed_extension = {
    feed_subtitle : string option;  (** Feed subtitle, if any *)
    feed_id : string;  (** Feed ID *)
    feed_categories : category list;  (** Feed-level categories *)
    feed_contributors : contributor list;  (** Feed-level contributors *)
    feed_generator : generator option;  (** Feed generator, if any *)
    feed_rights : string option;  (** Feed rights, if any *)
    feed_logo : string option;  (** Feed logo, if any *)
  }
  (** Feed-level extension data. *)

  type item_extension = {
    item_id : string;  (** Item ID *)
    item_published : Ptime.t option;  (** Publication time, if any *)
    item_contributors : contributor list;  (** Item-level contributors *)
    item_source : source option;  (** Source feed metadata, if any *)
    item_rights : string option;  (** Item rights, if any *)
    item_categories : category list;  (** Item-level categories *)
    item_content_type : content_type option;  (** Content type, if any *)
  }
  (** Item-level extension data. *)

  type t = {
    feed : Jsonfeed.t;  (** The JSONFeed itself *)
    extension : feed_extension option;  (** Feed-level extension, if any *)
  }
  (** A JSONFeed paired with its optional feed-level extension. *)

  type item = {
    item : Jsonfeed.Item.t;  (** The JSONFeed item itself *)
    extension : item_extension option;  (** Item-level extension, if any *)
  }
  (** A JSONFeed item paired with its optional item-level extension.

      Note: [item] and [t] both declare a field named [extension]. Without
      type information an access such as [x.extension] resolves to the most
      recently defined record ([item]), so reading the field of a [t] may
      need a type annotation. *)

  (** {2 Conversion from Atom} *)

  val of_atom : Syndic.Atom.feed -> t
  (** [of_atom feed] converts an Atom feed to JSONFeed with extensions.

      All Atom metadata is preserved using extensions. *)

  val item_of_atom : Syndic.Atom.entry -> item
  (** [item_of_atom entry] converts an Atom entry to JSONFeed item with
      extensions. *)

  (** {2 Conversion from RSS} *)

  val of_rss2 : Syndic.Rss2.channel -> t
  (** [of_rss2 channel] converts an RSS2 channel to JSONFeed. *)

  val item_of_rss2 : Syndic.Rss2.item -> item
  (** [item_of_rss2 item] converts an RSS2 item to JSONFeed item. *)

  (** {2 Conversion to Atom} *)

  val to_atom : t -> Syndic.Atom.feed
  (** [to_atom t] converts JSONFeed with extensions back to Atom feed.

      All original Atom metadata is restored from extensions. *)

  val item_to_atom : item -> Syndic.Atom.entry
  (** [item_to_atom item] converts JSONFeed item with extensions back to Atom
      entry. *)

  (** {2 Serialization} *)

  val to_string : ?minify:bool -> t -> (string, string) result
  (** [to_string ?minify t] serializes to JSON string with extensions. *)

  val of_string : string -> (t, string) result
  (** [of_string s] parses JSON string with extensions. *)

  (** {2 Utilities} *)

  val of_posts : title:string -> Post.t list -> t
  (** [of_posts ~title posts] creates JSONFeed from Post list with Atom
      extensions. *)

  val to_posts : feed:Feed.t -> t -> Post.t list
  (** [to_posts ~feed t] extracts posts from extended JSONFeed. *)
end

(** {1 Category Management} *)

module Category : sig
  (** Custom categories for organizing posts.

      Categories are manually defined and can be assigned to posts for
      organization and filtering. This is separate from feed-extracted tags. *)

  type t
  (** A custom category with metadata. *)

  val create :
    id:string ->
    name:string ->
    ?description:string ->
    unit ->
    t
  (** [create ~id ~name ?description ()] creates a new category.

      @param id Unique identifier for the category (e.g., "ocaml-projects")
      @param name Display name (e.g., "OCaml Projects")
      @param description Optional longer description *)

  val id : t -> string
  (** [id category] returns the unique identifier of the category. *)

  val name : t -> string
  (** [name category] returns the display name of the category. *)

  val description : t -> string option
  (** [description category] returns the description, if any. *)

  val to_json : t -> Jsont.json
  (** [to_json category] serializes a category to JSON. *)

  val of_json : Jsont.json -> (t, string) result
  (** [of_json json] deserializes a category from JSON. *)

  val jsont : t Jsont.t
  (** Jsont codec for categories. *)
end

(** {1 User Management} *)

module User : sig
  (** River user composed from Sortal contact data + sync state.

      User data is stored in Sortal and read on-demand. River only persists
      sync timestamps and optional per-user overrides. *)

  type t
  (** A River user composed from Sortal.Contact + sync metadata. *)

  val of_contact : Sortal.Contact.t -> ?last_synced:string -> unit -> t
  (** [of_contact contact ()] creates a River user from a Sortal contact.

      @param contact The Sortal contact to base this user on
      @param last_synced Optional ISO 8601 timestamp of last sync *)

  val username : t -> string
  (** [username user] returns the username (from Sortal.Contact.handle). *)

  val fullname : t -> string
  (** [fullname user] returns the full name (from
      Sortal.Contact.primary_name). *)

  val email : t -> string option
  (** [email user] returns the email address (from Sortal.Contact). *)

  val feeds : t -> Source.t list
  (** [feeds user] returns the list of subscribed feeds (from
      Sortal.Contact). *)

  val last_synced : t -> string option
  (** [last_synced user] returns the last sync timestamp if set. *)

  val contact : t -> Sortal.Contact.t
  (** [contact user] returns the underlying Sortal contact. *)

  val set_last_synced : t -> string -> t
  (** [set_last_synced user timestamp] returns a new user with updated sync
      time. *)
end

(** {1 Feed Quality Analysis} *)

module Quality : sig
  type t
  (** Quality metrics for a feed or user's aggregated feed. *)

  val make :
    total_entries:int ->
    entries_with_summary:int ->
    entries_with_author:int ->
    entries_with_date:int ->
    entries_with_content:int ->
    entries_with_tags:int ->
    avg_content_length:float ->
    min_content_length:int ->
    max_content_length:int ->
    posting_frequency_days:float option ->
    quality_score:float ->
    t
  (** [make ~total_entries ...] creates quality metrics. *)

  val total_entries : t -> int
  (** Accessor for the [total_entries] metric. *)

  val entries_with_summary : t -> int
  (** Accessor for the [entries_with_summary] metric. *)

  val entries_with_author : t -> int
  (** Accessor for the [entries_with_author] metric. *)

  val entries_with_date : t -> int
  (** Accessor for the [entries_with_date] metric. *)

  val entries_with_content : t -> int
  (** Accessor for the [entries_with_content] metric. *)

  val entries_with_tags : t -> int
  (** Accessor for the [entries_with_tags] metric. *)

  val avg_content_length : t -> float
  (** Accessor for the [avg_content_length] metric. *)

  val min_content_length : t -> int
  (** Accessor for the [min_content_length] metric. *)

  val max_content_length : t -> int
  (** Accessor for the [max_content_length] metric. *)

  val posting_frequency_days : t -> float option
  (** Accessor for the [posting_frequency_days] metric, if set. *)

  val quality_score : t -> float
  (** Accessor for the [quality_score] metric. *)

  val analyze : Syndic.Atom.entry list -> t
  (** [analyze entries] computes quality metrics from Atom entries.

      The quality score is a weighted average of:
      - Content completeness (40%)
      - Metadata completeness (30%)
      - Content richness (30%) *)
end

(** {1 State Management} *)

module State : sig
  type t
  (** State handle for managing sync state and feeds on disk.

      User contact data is read from Sortal on-demand. River only persists
      sync timestamps and feed data. *)

  val create :
    < fs : Eio.Fs.dir_ty Eio.Path.t; .. > ->
    app_name:string ->
    t
  (** [create env ~app_name] creates a state handle using XDG directories.

      Data is stored in:
      - Sync state: $XDG_STATE_HOME/[app_name]/sync_state.json
      - Feeds: $XDG_STATE_HOME/[app_name]/feeds/[username]/

      User contact data is read from Sortal's XDG location.

      @param env The Eio environment with filesystem access
      @param app_name Application name for XDG paths *)

  (** {2 User Operations} *)

  val get_user : t -> username:string -> User.t option
  (** [get_user state ~username] retrieves a user by username.

      This reads contact data from Sortal and combines it with River's sync
      state. Returns [None] if the username doesn't exist in Sortal or has no
      feeds. *)

  val list_users : t -> string list
  (** [list_users state] returns all usernames with feeds from Sortal. *)

  val get_all_users : t -> User.t list
  (** [get_all_users state] returns all users from Sortal with their sync
      state. *)

  val update_sync_state :
    t -> username:string -> timestamp:string -> (unit, string) result
  (** [update_sync_state state ~username ~timestamp] updates the last sync
      timestamp.

      @param username The user to update
      @param timestamp ISO 8601 timestamp of the sync *)

  (** {2 Feed Operations} *)

  val sync_user :
    < clock : float Eio.Time.clock_ty Eio.Resource.t;
      fs : Eio.Fs.dir_ty Eio.Path.t;
      net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
    t ->
    username:string ->
    (unit, string) result
  (** [sync_user env state ~username] fetches all feeds for the user and stores
      merged result.

      Posts are fetched concurrently and merged with existing posts.
      The result is stored as an Atom feed. *)

  val sync_all :
    < clock : float Eio.Time.clock_ty Eio.Resource.t;
      fs : Eio.Fs.dir_ty Eio.Path.t;
      net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
    t ->
    (int * int, string) result
  (** [sync_all env state] syncs all users concurrently.

      Returns [Ok (success_count, fail_count)]. *)

  val get_user_posts :
    t ->
    username:string ->
    ?limit:int ->
    unit ->
    Syndic.Atom.entry list
  (** [get_user_posts state ~username ()] retrieves stored posts for a user.

      @param limit Optional maximum number of posts to return *)

  val get_all_posts :
    t ->
    ?limit:int ->
    unit ->
    (string * Syndic.Atom.entry) list
  (** [get_all_posts state ()] retrieves posts from all users, sorted by date.

      Returns list of (username, entry) tuples.
      @param limit Optional maximum number of posts to return *)

  (** {2 Export} *)

  val export_merged_feed :
    t ->
    title:string ->
    format:[ `Atom | `Jsonfeed ] ->
    ?limit:int ->
    unit ->
    (string, string) result
  (** [export_merged_feed state ~title ~format ()] exports a merged feed of all
      users.

      @param title Feed title
      @param format Output format
      @param limit Optional maximum number of entries *)

  val export_html_site :
    t ->
    output_dir:Eio.Fs.dir_ty Eio.Path.t ->
    title:string ->
    ?posts_per_page:int ->
    unit ->
    (unit, string) result
  (** [export_html_site state ~output_dir ~title ()] exports a static HTML
      site.

      Generates a complete static site with:
      - Paginated post listings
      - Author index and individual author pages
      - Category index and individual category pages
      - Links page showing all outgoing links from posts

      @param output_dir Directory to write HTML files to
      @param title Site title
      @param posts_per_page Number of posts per page (default: 25) *)

  (** {2 Category Management} *)

  val list_categories : t -> Category.t list
  (** [list_categories state] returns all custom categories. *)

  val get_category : t -> id:string -> Category.t option
  (** [get_category state ~id] retrieves a category by ID. *)

  val add_category : t -> Category.t -> (unit, string) result
  (** [add_category state category] adds or updates a category.

      @param category The category to add/update *)

  val remove_category : t -> id:string -> (unit, string) result
  (** [remove_category state ~id] removes a category.

      This also removes the category from any posts that were tagged with it.
      @param id The category ID to remove *)

  val get_post_categories : t -> post_id:string -> string list
  (** [get_post_categories state ~post_id] returns the list of category IDs
      assigned to a post. *)

  val set_post_categories :
    t -> post_id:string -> category_ids:string list -> (unit, string) result
  (** [set_post_categories state ~post_id ~category_ids] sets the categories
      for a post.

      Replaces any existing category assignments for this post.
      @param post_id The post ID to categorize
      @param category_ids List of category IDs to assign *)

  val add_post_category :
    t -> post_id:string -> category_id:string -> (unit, string) result
  (** [add_post_category state ~post_id ~category_id] adds a category to a
      post.

      @param post_id The post ID
      @param category_id The category ID to add *)

  val remove_post_category :
    t -> post_id:string -> category_id:string -> (unit, string) result
  (** [remove_post_category state ~post_id ~category_id] removes a category
      from a post.

      @param post_id The post ID
      @param category_id The category ID to remove *)

  val get_posts_by_category : t -> category_id:string -> string list
  (** [get_posts_by_category state ~category_id] returns all post IDs with this
      category. *)

  (** {2 Analysis} *)

  val analyze_user_quality :
    t ->
    username:string ->
    (Quality.t, string) result
  (** [analyze_user_quality state ~username] analyzes quality metrics for a
      user's feed. *)
end