···
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18
+
(** River RSS/Atom/JSONFeed aggregator library *)
let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
module Log = (val Logs.src_log src : Logs.LOG)
21
-
(* Keep Client module internal *)
22
-
module Internal_client = Client
23
+
(** {1 Internal Utilities} *)
25
+
module Text_extract = struct
28
+
(* Remove all tags *)
29
+
let rec syndic_to_buffer b = function
30
+
| XML.Node (_, _, subs) -> List.iter (syndic_to_buffer b) subs
31
+
| XML.Data (_, d) -> Buffer.add_string b d
33
+
let syndic_to_string x =
34
+
let b = Buffer.create 1024 in
35
+
List.iter (syndic_to_buffer b) x;
38
+
let string_of_text_construct : Atom.text_construct -> string = function
39
+
| Atom.Text s | Atom.Html (_, s) -> s
40
+
| Atom.Xhtml (_, x) -> syndic_to_string x
43
+
module Html_meta = struct
44
+
[@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
46
+
(** This module determines an image to be used as preview of a website.
48
+
It does this by following the same logic Google+ and other websites use, and
49
+
described in this article:
50
+
https://www.raymondcamden.com/2011/07/26/How-are-Facebook-and-Google-creating-link-previews *)
54
+
let soup = parse html in
55
+
try soup $ "meta[property=og:image]" |> R.attribute "content" |> Option.some
56
+
with Failure _ -> None
58
+
let image_src html =
60
+
let soup = parse html in
61
+
try soup $ "link[rel=\"image_src\"]" |> R.attribute "href" |> Option.some
62
+
with Failure _ -> None
64
+
let twitter_image html =
66
+
let soup = parse html in
68
+
soup $ "meta[name=\"twitter:image\"]" |> R.attribute "content"
70
+
with Failure _ -> None
72
+
let og_description html =
74
+
let soup = parse html in
76
+
soup $ "meta[property=og:description]" |> R.attribute "content"
78
+
with Failure _ -> None
80
+
let description html =
82
+
let soup = parse html in
84
+
soup $ "meta[property=description]" |> R.attribute "content" |> Option.some
85
+
with Failure _ -> None
87
+
let preview_image html =
89
+
match og_image html with
91
+
match image_src html with
92
+
| None -> twitter_image html
96
+
match Option.map String.trim preview_image with
101
+
let description html =
102
+
let preview_image =
103
+
match og_description html with None -> description html | Some x -> Some x
105
+
match Option.map String.trim preview_image with
111
+
module Html_markdown = struct
112
+
[@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
114
+
(** HTML to Markdown converter using Lambda Soup *)
116
+
(** Extract all links from HTML content *)
117
+
let extract_links html_str =
119
+
let soup = Soup.parse html_str in
120
+
let links = Soup.select "a[href]" soup in
121
+
Soup.fold (fun acc link ->
122
+
match Soup.attribute "href" link with
124
+
let text = Soup.texts link |> String.concat "" |> String.trim in
125
+
(href, text) :: acc
131
+
(** Check if string contains any whitespace *)
132
+
let has_whitespace s =
134
+
let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
136
+
with Not_found -> false
138
+
(** Clean up excessive newlines and normalize spacing *)
139
+
let cleanup_markdown s =
140
+
(* Normalize line endings *)
141
+
let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
143
+
(* Remove trailing whitespace from each line *)
144
+
let lines = String.split_on_char '\n' s in
145
+
let lines = List.map (fun line ->
146
+
(* Trim trailing spaces but preserve leading spaces for indentation *)
147
+
let len = String.length line in
148
+
let rec find_last_non_space i =
150
+
else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
153
+
let last = find_last_non_space (len - 1) in
154
+
if last < 0 then ""
155
+
else String.sub line 0 (last + 1)
158
+
(* Join back and collapse excessive blank lines *)
159
+
let s = String.concat "\n" lines in
161
+
(* Replace 3+ consecutive newlines with just 2 *)
162
+
let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
164
+
(* Trim leading and trailing whitespace *)
167
+
(** Convert HTML to Markdown using state-based whitespace handling *)
168
+
let html_to_markdown html_str =
170
+
let soup = Soup.parse html_str in
171
+
let buffer = Buffer.create 256 in
173
+
(* State: track if we need to insert a space before next text *)
174
+
let need_space = ref false in
176
+
(* Get last character in buffer, if any *)
178
+
let len = Buffer.length buffer in
179
+
if len = 0 then None
180
+
else Some (Buffer.nth buffer (len - 1))
183
+
(* Add text with proper spacing *)
184
+
let add_text text =
185
+
let trimmed = String.trim text in
186
+
if trimmed <> "" then begin
187
+
(* Check if text starts with punctuation that shouldn't have space before it *)
188
+
let starts_with_punctuation =
189
+
String.length trimmed > 0 &&
190
+
(match trimmed.[0] with
191
+
| ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
195
+
(* Add space if needed, unless we're before punctuation *)
196
+
if !need_space && not starts_with_punctuation then begin
197
+
match last_char () with
198
+
| Some (' ' | '\n') -> ()
199
+
| _ -> Buffer.add_char buffer ' '
201
+
Buffer.add_string buffer trimmed;
202
+
need_space := false
206
+
(* Mark that we need space before next text (for inline elements) *)
207
+
let mark_space_needed () =
208
+
need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
211
+
(* Process header with ID/anchor handling *)
212
+
let process_header level elem =
213
+
need_space := false;
215
+
(* Check if header contains a link with an ID fragment *)
216
+
let link_opt = Soup.select_one "a[href]" elem in
217
+
let anchor_id = match link_opt with
219
+
(match Soup.attribute "href" link with
221
+
(* Extract fragment from URL *)
222
+
let uri = Uri.of_string href in
228
+
(* Add anchor if we found an ID *)
229
+
(match anchor_id with
230
+
| Some id when id <> "" ->
231
+
Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
234
+
(* Add the header marker *)
235
+
let marker = String.make level '#' in
236
+
Buffer.add_string buffer ("\n" ^ marker ^ " ");
238
+
(* Get text content, excluding link tags *)
239
+
let text = Soup.texts elem |> String.concat " " |> String.trim in
240
+
Buffer.add_string buffer text;
242
+
Buffer.add_string buffer "\n\n";
243
+
need_space := false
246
+
let rec process_node node =
247
+
match Soup.element node with
249
+
let tag = Soup.name elem in
251
+
(* Block elements - reset space tracking *)
252
+
| "h1" -> process_header 1 elem
253
+
| "h2" -> process_header 2 elem
254
+
| "h3" -> process_header 3 elem
255
+
| "h4" -> process_header 4 elem
256
+
| "h5" -> process_header 5 elem
257
+
| "h6" -> process_header 6 elem
259
+
need_space := false;
260
+
Soup.children elem |> Soup.iter process_node;
261
+
Buffer.add_string buffer "\n\n";
262
+
need_space := false
264
+
Buffer.add_string buffer "\n";
265
+
need_space := false
266
+
(* Inline elements - preserve space tracking *)
267
+
| "strong" | "b" ->
268
+
(* Add space before if needed *)
269
+
if !need_space then begin
270
+
match last_char () with
271
+
| Some (' ' | '\n') -> ()
272
+
| _ -> Buffer.add_char buffer ' '
274
+
Buffer.add_string buffer "**";
275
+
need_space := false;
276
+
Soup.children elem |> Soup.iter process_node;
277
+
Buffer.add_string buffer "**";
278
+
mark_space_needed ()
280
+
(* Add space before if needed *)
281
+
if !need_space then begin
282
+
match last_char () with
283
+
| Some (' ' | '\n') -> ()
284
+
| _ -> Buffer.add_char buffer ' '
286
+
Buffer.add_string buffer "*";
287
+
need_space := false;
288
+
Soup.children elem |> Soup.iter process_node;
289
+
Buffer.add_string buffer "*";
290
+
mark_space_needed ()
292
+
(* Add space before if needed *)
293
+
if !need_space then begin
294
+
match last_char () with
295
+
| Some (' ' | '\n') -> ()
296
+
| _ -> Buffer.add_char buffer ' '
298
+
Buffer.add_string buffer "`";
299
+
need_space := false;
300
+
Soup.children elem |> Soup.iter process_node;
301
+
Buffer.add_string buffer "`";
302
+
mark_space_needed ()
304
+
need_space := false;
305
+
Buffer.add_string buffer "\n```\n";
306
+
Soup.children elem |> Soup.iter process_node;
307
+
Buffer.add_string buffer "\n```\n\n";
308
+
need_space := false
310
+
let text = Soup.texts elem |> String.concat " " |> String.trim in
311
+
let href = Soup.attribute "href" elem in
314
+
(* Add space before link if needed *)
315
+
if !need_space then begin
316
+
match last_char () with
317
+
| Some (' ' | '\n') -> ()
318
+
| _ -> Buffer.add_char buffer ' '
320
+
need_space := false;
322
+
(* Add the link markdown *)
324
+
Buffer.add_string buffer (Printf.sprintf "<%s>" href)
326
+
Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
328
+
(* Mark that space may be needed after link *)
329
+
mark_space_needed ()
333
+
need_space := false;
334
+
Buffer.add_string buffer "\n";
335
+
let is_ordered = tag = "ol" in
336
+
let items = Soup.children elem |> Soup.to_list in
337
+
List.iteri (fun i item ->
338
+
match Soup.element item with
339
+
| Some li when Soup.name li = "li" ->
340
+
need_space := false;
342
+
Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
344
+
Buffer.add_string buffer "- ";
345
+
Soup.children li |> Soup.iter process_node;
346
+
Buffer.add_string buffer "\n"
349
+
Buffer.add_string buffer "\n";
350
+
need_space := false
352
+
need_space := false;
353
+
Buffer.add_string buffer "\n> ";
354
+
Soup.children elem |> Soup.iter process_node;
355
+
Buffer.add_string buffer "\n\n";
356
+
need_space := false
358
+
(* Add space before if needed *)
359
+
if !need_space then begin
360
+
match last_char () with
361
+
| Some (' ' | '\n') -> ()
362
+
| _ -> Buffer.add_char buffer ' '
364
+
let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
365
+
let src = Soup.attribute "src" elem |> Option.value ~default:"" in
366
+
Buffer.add_string buffer (Printf.sprintf "" alt src);
367
+
need_space := false;
368
+
mark_space_needed ()
370
+
need_space := false;
371
+
Buffer.add_string buffer "\n---\n\n";
372
+
need_space := false
373
+
(* Strip these tags but keep content *)
374
+
| "div" | "span" | "article" | "section" | "header" | "footer"
375
+
| "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
376
+
Soup.children elem |> Soup.iter process_node
377
+
(* Ignore script, style, etc *)
378
+
| "script" | "style" | "noscript" -> ()
379
+
(* Default: just process children *)
381
+
Soup.children elem |> Soup.iter process_node)
383
+
(* Text node - handle whitespace properly *)
384
+
match Soup.leaf_text node with
386
+
(* If text is only whitespace, mark that we need space *)
387
+
let trimmed = String.trim text in
388
+
if trimmed = "" then begin
389
+
if has_whitespace text then
392
+
(* Text has content - check if it had leading/trailing whitespace *)
393
+
let had_leading_ws = has_whitespace text &&
394
+
(String.length text > 0 &&
395
+
(text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
397
+
(* If had leading whitespace, mark we need space *)
398
+
if had_leading_ws then need_space := true;
400
+
(* Add the text content *)
403
+
(* If had trailing whitespace, mark we need space for next *)
404
+
let had_trailing_ws = has_whitespace text &&
405
+
(String.length text > 0 &&
406
+
let last = text.[String.length text - 1] in
407
+
last = ' ' || last = '\t' || last = '\n' || last = '\r') in
408
+
if had_trailing_ws then need_space := true
413
+
Soup.children soup |> Soup.iter process_node;
415
+
(* Clean up the result *)
416
+
let result = Buffer.contents buffer in
417
+
cleanup_markdown result
420
+
(** Convert HTML content to clean Markdown *)
421
+
let to_markdown html_str =
422
+
html_to_markdown html_str
425
+
(** {1 Feed Sources} *)
427
+
module Source = struct
433
+
let make ~name ~url = { name; url }
435
+
let name t = t.name
439
+
let make name url = { name; url } in
440
+
Jsont.Object.map ~kind:"Source" make
441
+
|> Jsont.Object.mem "name" Jsont.string ~enc:(fun s -> s.name)
442
+
|> Jsont.Object.mem "url" Jsont.string ~enc:(fun s -> s.url)
443
+
|> Jsont.Object.finish
446
+
(** {1 HTTP Session Management} *)
448
+
module Session = struct
450
+
session : (float Eio.Time.clock_ty Eio.Resource.t,
451
+
[`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t;
455
+
Log.info (fun m -> m "Initializing River session");
456
+
let session = Requests.create ~sw
457
+
~default_headers:(Requests.Headers.of_list [
458
+
("User-Agent", "OCaml-River/1.0");
460
+
~follow_redirects:true
467
+
let with_session env f =
468
+
Log.info (fun m -> m "Creating River session");
469
+
Eio.Switch.run @@ fun sw ->
470
+
let client = init ~sw env in
473
+
let get_requests_session t = t.session
476
+
(** {1 Feeds and Posts} *)
478
+
module Feed = struct
479
+
type feed_content =
480
+
| Atom of Syndic.Atom.feed
481
+
| Rss2 of Syndic.Rss2.channel
482
+
| Json of Jsonfeed.t
487
+
content : feed_content;
490
+
let string_of_feed = function
493
+
| Json _ -> "JSONFeed"
495
+
let classify_feed ~xmlbase (body : string) =
496
+
Log.debug (fun m -> m "Attempting to parse feed (%d bytes)" (String.length body));
498
+
(* Quick check - does it look like JSON? *)
499
+
let looks_like_json =
500
+
String.length body > 0 &&
501
+
let first_char = String.get body 0 in
502
+
first_char = '{' || first_char = '['
505
+
if looks_like_json then (
506
+
(* Try JSONFeed first *)
507
+
Log.debug (fun m -> m "Body looks like JSON, trying JSONFeed parser");
508
+
match Jsonfeed.of_string body with
510
+
Log.debug (fun m -> m "Successfully parsed as JSONFeed");
513
+
Log.debug (fun m -> m "Not a JSONFeed: %s" (Jsont.Error.to_string err));
514
+
(* Fall through to XML parsing *)
515
+
failwith "Not a valid JSONFeed"
517
+
(* Try XML formats *)
519
+
let feed = Atom (Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in
520
+
Log.debug (fun m -> m "Successfully parsed as Atom feed");
523
+
| Syndic.Atom.Error.Error (pos, msg) -> (
524
+
Log.debug (fun m -> m "Not an Atom feed: %s at position (%d, %d)"
525
+
msg (fst pos) (snd pos));
527
+
let feed = Rss2 (Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in
528
+
Log.debug (fun m -> m "Successfully parsed as RSS2 feed");
530
+
with Syndic.Rss2.Error.Error (pos, msg) ->
531
+
Log.err (fun m -> m "Failed to parse as RSS2: %s at position (%d, %d)"
532
+
msg (fst pos) (snd pos));
533
+
failwith "Neither Atom nor RSS2 feed")
534
+
| Not_found as e ->
535
+
Log.err (fun m -> m "Not_found exception during Atom feed parsing");
536
+
Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ()));
539
+
Log.err (fun m -> m "Unexpected exception during feed parsing: %s"
540
+
(Printexc.to_string e));
541
+
Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ()));
545
+
let fetch session source =
546
+
Log.info (fun m -> m "Fetching feed: %s" (Source.name source));
548
+
let xmlbase = Uri.of_string (Source.url source) in
550
+
(* Use Requests_json_api.get_result for clean Result-based error handling *)
551
+
let requests_session = Session.get_requests_session session in
553
+
match Requests_json_api.get_result requests_session (Source.url source) with
555
+
Log.info (fun m -> m "Successfully fetched %s (%d bytes)"
556
+
(Source.url source) (String.length body));
558
+
| Error (status, msg) ->
559
+
Log.err (fun m -> m "Failed to fetch feed '%s': HTTP %d - %s"
560
+
(Source.name source) status msg);
561
+
failwith (Printf.sprintf "HTTP %d: %s" status msg)
564
+
let content = classify_feed ~xmlbase response in
567
+
| Atom atom -> Text_extract.string_of_text_construct atom.Syndic.Atom.title
568
+
| Rss2 ch -> ch.Syndic.Rss2.title
569
+
| Json jsonfeed -> Jsonfeed.title jsonfeed
572
+
Log.info (fun m -> m "Successfully fetched %s feed '%s' (title: '%s')"
573
+
(string_of_feed content) (Source.name source) title);
575
+
{ source; title; content }
577
+
let source t = t.source
24
-
(* Abstract session type *)
25
-
type session = Client.t
582
+
module Post = struct
586
+
link : Uri.t option;
587
+
date : Syndic.Date.t option;
591
+
content : Soup.soup Soup.node;
592
+
mutable link_response : (string, string) result option;
593
+
tags : string list;
594
+
summary : string option;
27
-
type source = Feed.source = { name : string; url : string }
597
+
(** Generate a stable, unique ID from available data *)
598
+
let generate_id ?guid ?link ?title ?date ~feed_url () =
600
+
| Some id when id <> "" ->
601
+
(* Use explicit ID/GUID if available *)
605
+
| Some uri when Uri.to_string uri <> "" ->
606
+
(* Use permalink as ID (stable and unique) *)
609
+
(* Fallback: hash of feed_url + title + date *)
610
+
let title_str = Option.value title ~default:"" in
613
+
| Some d -> Ptime.to_rfc3339 d
616
+
let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in
617
+
(* Use SHA256 for stable hashing *)
618
+
Digest.string composite |> Digest.to_hex
31
-
(* Session management *)
33
-
Log.info (fun m -> m "Initializing River session");
34
-
Internal_client.create ~sw env
620
+
let resolve_links_attr ~xmlbase attr el =
621
+
Soup.R.attribute attr el
623
+
|> Syndic.XML.resolve ~xmlbase
625
+
|> fun value -> Soup.set_attribute attr value el
36
-
let with_session env f =
37
-
Log.info (fun m -> m "Creating River session");
38
-
Internal_client.with_client env f
627
+
(* Things that posts should not contain *)
628
+
let undesired_tags = [ "style"; "script" ]
629
+
let undesired_attr = [ "id" ]
40
-
(* Feed operations *)
41
-
let fetch session source =
42
-
Log.info (fun m -> m "Fetching feed: %s" source.name);
43
-
Feed.fetch session source
631
+
let html_of_text ?xmlbase s =
632
+
let soup = Soup.parse s in
633
+
let ($$) = Soup.($$) in
634
+
soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
635
+
soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
636
+
undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
637
+
soup $$ "*" |> Soup.iter (fun el ->
638
+
undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
45
-
let name feed = feed.Feed.name
46
-
let url feed = feed.Feed.url
641
+
(* Do not trust sites using XML for HTML content. Convert to string and parse
642
+
back. (Does not always fix bad HTML unfortunately.) *)
643
+
let html_of_syndic =
644
+
let ns_prefix _ = Some "" in
646
+
html_of_text ?xmlbase
647
+
(String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))
49
-
Log.info (fun m -> m "Aggregating posts from %d feed(s)" (List.length feeds));
50
-
let result = Post.get_posts feeds in
51
-
Log.info (fun m -> m "Aggregated %d posts total" (List.length result));
649
+
let string_of_option = function None -> "" | Some s -> s
54
-
let title post = post.Post.title
55
-
let link post = post.Post.link
56
-
let date post = post.Post.date
57
-
let feed post = post.Post.feed
58
-
let author post = post.Post.author
59
-
let email post = post.Post.email
60
-
let content post = Soup.to_string post.Post.content
61
-
let id post = post.Post.id
62
-
let tags post = post.Post.tags
63
-
let summary post = post.Post.summary
651
+
let post_compare p1 p2 =
652
+
(* Most recent posts first. Posts with no date are always last *)
653
+
match (p1.date, p2.date) with
654
+
| Some d1, Some d2 -> Syndic.Date.compare d2 d1
655
+
| None, Some _ -> 1
656
+
| Some _, None -> -1
65
-
let meta_description _post =
66
-
(* TODO: This requires environment for HTTP access *)
67
-
Log.debug (fun m -> m "meta_description not implemented (requires environment)");
659
+
let rec remove n l =
660
+
if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl
70
-
let seo_image _post =
71
-
(* TODO: This requires environment for HTTP access *)
72
-
Log.debug (fun m -> m "seo_image not implemented (requires environment)");
662
+
let rec take n = function
664
+
| e :: tl -> if n > 0 then e :: take (n - 1) tl else []
75
-
let create_atom_entries posts =
76
-
Log.info (fun m -> m "Creating Atom entries for %d posts" (List.length posts));
77
-
Post.mk_entries posts
666
+
let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
667
+
Log.debug (fun m -> m "Processing Atom entry: %s"
668
+
(Text_extract.string_of_text_construct e.title));
79
-
(* JSONFeed support *)
80
-
let create_jsonfeed_items posts =
81
-
Log.info (fun m -> m "Creating JSONFeed items for %d posts" (List.length posts));
82
-
Post.mk_jsonfeed_items posts
673
+
(List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
675
+
with Not_found -> (
676
+
Log.debug (fun m -> m "No alternate link found, trying fallback");
678
+
| l :: _ -> Some l.href
680
+
match Uri.scheme e.id with
681
+
| Some "http" -> Some e.id
682
+
| Some "https" -> Some e.id
686
+
match e.published with Some _ -> e.published | None -> Some e.updated
689
+
match e.content with
690
+
| Some (Text s) -> html_of_text s
691
+
| Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
692
+
| Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
693
+
| Some (Mime _) | Some (Src _) | None -> (
694
+
match e.summary with
695
+
| Some (Text s) -> html_of_text s
696
+
| Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
697
+
| Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
698
+
| None -> Soup.parse "")
700
+
let is_valid_author_name name =
701
+
(* Filter out empty strings and placeholder values like "Unknown" *)
702
+
let trimmed = String.trim name in
703
+
trimmed <> "" && trimmed <> "Unknown"
706
+
(* Fallback chain for author:
707
+
1. Entry author (if present, not empty, and not "Unknown")
708
+
2. Feed-level author (from Atom feed metadata)
709
+
3. Feed title (from Atom feed metadata)
710
+
4. Source name (manually entered feed name) *)
712
+
let author, _ = e.authors in
713
+
let trimmed = String.trim author.name in
714
+
if is_valid_author_name author.name then trimmed
715
+
else raise Not_found (* Try feed-level author *)
716
+
with Not_found -> (
717
+
match feed.content with
718
+
| Feed.Atom atom_feed -> (
719
+
(* Try feed-level authors *)
720
+
match atom_feed.Syndic.Atom.authors with
721
+
| author :: _ when is_valid_author_name author.name ->
722
+
String.trim author.name
724
+
(* Use feed title *)
725
+
Text_extract.string_of_text_construct atom_feed.Syndic.Atom.title)
726
+
| Feed.Rss2 _ | Feed.Json _ ->
727
+
(* For RSS2 and JSONFeed, use the source name *)
728
+
Source.name feed.source)
730
+
(* Extract tags from Atom categories *)
732
+
List.map (fun cat -> cat.Syndic.Atom.term) e.categories
734
+
(* Extract summary - convert from text_construct to string *)
736
+
match e.summary with
737
+
| Some s -> Some (Text_extract.string_of_text_construct s)
740
+
(* Generate unique ID *)
741
+
let guid = Uri.to_string e.id in
742
+
let title_str = Text_extract.string_of_text_construct e.title in
744
+
generate_id ~guid ?link ~title:title_str ?date
745
+
~feed_url:(Source.url feed.source) ()
753
+
author = author_name;
756
+
link_response = None;
84
-
let create_jsonfeed ~title ?home_page_url ?feed_url ?description ?icon ?favicon posts =
85
-
Log.info (fun m -> m "Creating JSONFeed with %d posts" (List.length posts));
86
-
let items = create_jsonfeed_items posts in
87
-
Jsonfeed.create ~title ?home_page_url ?feed_url ?description ?icon ?favicon ~items ()
761
+
let post_of_rss2 ~(feed : Feed.t) it =
762
+
let title, content =
763
+
match it.Syndic.Rss2.story with
764
+
| All (t, xmlbase, d) -> (
766
+
match it.content with
767
+
| _, "" -> html_of_text ?xmlbase d
768
+
| xmlbase, c -> html_of_text ?xmlbase c ))
770
+
let xmlbase, c = it.content in
771
+
(t, html_of_text ?xmlbase c)
772
+
| Description (xmlbase, d) -> (
774
+
match it.content with
775
+
| _, "" -> html_of_text ?xmlbase d
776
+
| xmlbase, c -> html_of_text ?xmlbase c ))
778
+
(* Note: it.link is of type Uri.t option in Syndic *)
780
+
match (it.guid, it.link) with
781
+
| Some u, _ when u.permalink -> Some u.data
782
+
| _, Some _ -> it.link
784
+
(* Sometimes the guid is indicated with isPermaLink="false" but is
785
+
nonetheless the only URL we get (e.g. ocamlpro). *)
787
+
| None, None -> None
789
+
(* Extract GUID string for ID generation *)
792
+
| Some u -> Some (Uri.to_string u.data)
795
+
(* RSS2 doesn't have a categories field exposed, use empty list *)
797
+
(* RSS2 doesn't have a separate summary field, so leave it empty *)
798
+
let summary = None in
799
+
(* Generate unique ID *)
801
+
generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
802
+
~feed_url:(Source.url feed.source) ()
809
+
author = Source.name feed.source;
810
+
email = string_of_option it.author;
813
+
link_response = None;
89
-
let jsonfeed_to_string ?(minify = false) jsonfeed =
90
-
match Jsonfeed.to_string ~minify jsonfeed with
92
-
| Error err -> Error (Jsont.Error.to_string err)
818
+
let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
819
+
Log.debug (fun m -> m "Processing JSONFeed item: %s"
820
+
(Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));
95
-
| Atom of Syndic.Atom.feed
96
-
| Rss2 of Syndic.Rss2.channel
97
-
| JSONFeed of Jsonfeed.t
822
+
(* Extract content - prefer HTML, fall back to text *)
824
+
match Jsonfeed.Item.content item with
825
+
| `Html html -> html_of_text html
826
+
| `Text text -> html_of_text text
827
+
| `Both (html, _text) -> html_of_text html
99
-
let feed_content feed =
100
-
match feed.Feed.content with
101
-
| Feed.Atom f -> Atom f
102
-
| Feed.Rss2 ch -> Rss2 ch
103
-
| Feed.Json jf -> JSONFeed jf
830
+
(* Extract author - use first author if multiple *)
831
+
let author_name, author_email =
832
+
match Jsonfeed.Item.authors item with
833
+
| Some (first :: _) ->
834
+
let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
835
+
(* JSONFeed authors don't typically have email *)
838
+
(* Fall back to feed-level authors or feed title *)
839
+
(match feed.content with
840
+
| Feed.Json jsonfeed ->
841
+
(match Jsonfeed.authors jsonfeed with
842
+
| Some (first :: _) ->
843
+
let name = Jsonfeed.Author.name first |> Option.value ~default:feed.title in
845
+
| _ -> (feed.title, ""))
846
+
| _ -> (feed.title, ""))
849
+
(* Link - use url field *)
851
+
Jsonfeed.Item.url item
852
+
|> Option.map Uri.of_string
856
+
let date = Jsonfeed.Item.date_published item in
859
+
let summary = Jsonfeed.Item.summary item in
863
+
Jsonfeed.Item.tags item
864
+
|> Option.value ~default:[]
867
+
(* Generate unique ID - JSONFeed items always have an id field (required) *)
868
+
let guid = Jsonfeed.Item.id item in
869
+
let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in
871
+
generate_id ~guid ?link ~title:title_str ?date
872
+
~feed_url:(Source.url feed.source) ()
881
+
author = author_name;
882
+
email = author_email;
884
+
link_response = None;
889
+
let posts_of_feed c =
890
+
match c.Feed.content with
892
+
let posts = List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries in
893
+
Log.debug (fun m -> m "Extracted %d posts from Atom feed '%s'"
894
+
(List.length posts) (Source.name c.source));
897
+
let posts = List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items in
898
+
Log.debug (fun m -> m "Extracted %d posts from RSS2 feed '%s'"
899
+
(List.length posts) (Source.name c.source));
901
+
| Feed.Json jsonfeed ->
902
+
let items = Jsonfeed.items jsonfeed in
903
+
let posts = List.map (post_of_jsonfeed_item ~feed:c) items in
904
+
Log.debug (fun m -> m "Extracted %d posts from JSONFeed '%s'"
905
+
(List.length posts) (Source.name c.source));
908
+
let get_posts ?n ?(ofs = 0) planet_feeds =
909
+
Log.info (fun m -> m "Processing %d feeds for posts" (List.length planet_feeds));
911
+
let posts = List.concat @@ List.map posts_of_feed planet_feeds in
912
+
Log.debug (fun m -> m "Total posts collected: %d" (List.length posts));
914
+
let posts = List.sort post_compare posts in
915
+
Log.debug (fun m -> m "Posts sorted by date (most recent first)");
917
+
let posts = remove ofs posts in
921
+
Log.debug (fun m -> m "Returning all %d posts (offset=%d)"
922
+
(List.length posts) ofs);
925
+
let limited = take n posts in
926
+
Log.debug (fun m -> m "Returning %d posts (requested=%d, offset=%d)"
927
+
(List.length limited) n ofs);
932
+
let of_feeds feeds = get_posts feeds
934
+
let feed t = t.feed
935
+
let title t = t.title
936
+
let link t = t.link
937
+
let date t = t.date
938
+
let author t = t.author
939
+
let email t = t.email
940
+
let content t = Soup.to_string t.content
942
+
let tags t = t.tags
943
+
let summary t = t.summary
945
+
let meta_description _t =
946
+
(* TODO: This requires environment for HTTP access *)
947
+
Log.debug (fun m -> m "meta_description not implemented (requires environment)");
951
+
(* TODO: This requires environment for HTTP access *)
952
+
Log.debug (fun m -> m "seo_image not implemented (requires environment)");
956
+
(** {1 Format Conversion and Export} *)
958
+
module Format = struct
959
+
module Atom = struct
960
+
let entry_of_post post =
961
+
let content = Syndic.Atom.Html (None, Post.content post) in
963
+
[ Syndic.Atom.author ~uri:(Uri.of_string (Source.url (Feed.source (Post.feed post))))
964
+
(Source.name (Feed.source (Post.feed post))) ]
967
+
match Post.link post with
968
+
| Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
972
+
match Post.link post with
974
+
| None -> Uri.of_string (Digest.to_hex (Digest.string (Post.title post)))
976
+
let authors = (Syndic.Atom.author ~email:(Post.email post) (Post.author post), []) in
977
+
let title : Syndic.Atom.text_construct = Syndic.Atom.Text (Post.title post) in
979
+
match Post.date post with
980
+
(* Atom entry requires a date but RSS2 does not. So if a date
981
+
* is not available, just capture the current date. *)
982
+
| None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
985
+
Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
988
+
let entries_of_posts posts = List.map entry_of_post posts
990
+
let feed_of_entries ~title ?id ?(authors = []) entries =
991
+
let feed_id = match id with
992
+
| Some i -> Uri.of_string i
993
+
| None -> Uri.of_string "urn:river:merged"
995
+
let feed_authors = List.map (fun (name, email) ->
997
+
| Some e -> Syndic.Atom.author ~email:e name
998
+
| None -> Syndic.Atom.author name
1001
+
Syndic.Atom.id = feed_id;
1002
+
title = Syndic.Atom.Text title;
1003
+
updated = Ptime.of_float_s (Unix.time ()) |> Option.get;
1005
+
authors = feed_authors;
1007
+
contributors = [];
1008
+
generator = Some {
1009
+
Syndic.Atom.version = Some "1.0";
1011
+
content = "River Feed Aggregator";
1020
+
let to_string feed =
1021
+
let output = Buffer.create 4096 in
1022
+
Syndic.Atom.output feed (`Buffer output);
1023
+
Buffer.contents output
1026
+
module Rss2 = struct
1027
+
let of_feed feed =
1028
+
match feed.Feed.content with
1029
+
| Feed.Rss2 ch -> Some ch
1033
+
module Jsonfeed = struct
1034
+
let item_of_post post =
1035
+
(* Convert HTML content back to string *)
1036
+
let html = Post.content post in
1037
+
let content = `Html html in
1039
+
(* Create author *)
1041
+
if Post.author post <> "" then
1042
+
let author = Jsonfeed.Author.create ~name:(Post.author post) () in
1049
+
Jsonfeed.Item.create
1050
+
~id:(Post.id post)
1052
+
?url:(Option.map Uri.to_string (Post.link post))
1053
+
~title:(Post.title post)
1054
+
?summary:(Post.summary post)
1055
+
?date_published:(Post.date post)
1057
+
~tags:(Post.tags post)
1060
+
let items_of_posts posts = List.map item_of_post posts
1062
+
let feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items =
1063
+
Jsonfeed.create ~title ?home_page_url ?feed_url ?description ?icon ?favicon ~items ()
1065
+
let feed_of_posts ~title ?home_page_url ?feed_url ?description ?icon ?favicon posts =
1066
+
let items = items_of_posts posts in
1067
+
feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items
1069
+
let to_string ?(minify = false) jsonfeed =
1070
+
match Jsonfeed.to_string ~minify jsonfeed with
1072
+
| Error err -> Error (Jsont.Error.to_string err)
1074
+
let of_feed feed =
1075
+
match feed.Feed.content with
1076
+
| Feed.Json jf -> Some jf
1081
+
(** {1 User Management} *)
1083
+
module User = struct
1085
+
username : string;
1086
+
fullname : string;
1087
+
email : string option;
1088
+
feeds : Source.t list;
1089
+
last_synced : string option;
1092
+
(** Construct a user record. [feeds] defaults to the empty list;
    [email] and [last_synced] default to [None]. *)
let make ~username ~fullname ?email ?(feeds = []) ?last_synced () =
  {
    username = username;
    fullname = fullname;
    email = email;
    feeds = feeds;
    last_synced = last_synced;
  }
1095
+
(* Field accessors, written with binding-site destructuring. *)
let username { username; _ } = username
let fullname { fullname; _ } = fullname
let email { email; _ } = email
let feeds { feeds; _ } = feeds
let last_synced { last_synced; _ } = last_synced
1101
+
(** Return [t] with [source] prepended to its feed list. *)
let add_feed t source = { t with feeds = source :: t.feeds }
1104
+
let remove_feed t ~url =
1105
+
let feeds = List.filter (fun s -> Source.url s <> url) t.feeds in
1108
+
(** Return [t] with its last-synced marker set to [timestamp]. *)
let set_last_synced t timestamp = { t with last_synced = Some timestamp }
1112
+
let make username fullname email feeds last_synced =
1113
+
{ username; fullname; email; feeds; last_synced }
1115
+
Jsont.Object.map ~kind:"User" make
1116
+
|> Jsont.Object.mem "username" Jsont.string ~enc:(fun u -> u.username)
1117
+
|> Jsont.Object.mem "fullname" Jsont.string ~enc:(fun u -> u.fullname)
1118
+
|> Jsont.Object.opt_mem "email" Jsont.string ~enc:(fun u -> u.email)
1119
+
|> Jsont.Object.mem "feeds" (Jsont.list Source.jsont) ~enc:(fun u -> u.feeds)
1120
+
|> Jsont.Object.opt_mem "last_synced" Jsont.string ~enc:(fun u -> u.last_synced)
1121
+
|> Jsont.Object.finish
1124
+
(** {1 Feed Quality Analysis} *)
1126
+
module Quality = struct
1128
+
total_entries : int;
1129
+
entries_with_summary : int;
1130
+
entries_with_author : int;
1131
+
entries_with_date : int;
1132
+
entries_with_content : int;
1133
+
entries_with_tags : int;
1134
+
avg_content_length : float;
1135
+
min_content_length : int;
1136
+
max_content_length : int;
1137
+
posting_frequency_days : float option;
1138
+
quality_score : float;
1141
+
let make ~total_entries ~entries_with_summary ~entries_with_author
1142
+
~entries_with_date ~entries_with_content ~entries_with_tags
1143
+
~avg_content_length ~min_content_length ~max_content_length
1144
+
~posting_frequency_days ~quality_score =
1147
+
entries_with_summary;
1148
+
entries_with_author;
1149
+
entries_with_date;
1150
+
entries_with_content;
1151
+
entries_with_tags;
1152
+
avg_content_length;
1153
+
min_content_length;
1154
+
max_content_length;
1155
+
posting_frequency_days;
1159
+
(* Metric accessors, written with binding-site destructuring. *)
let total_entries { total_entries; _ } = total_entries
let entries_with_summary { entries_with_summary; _ } = entries_with_summary
let entries_with_author { entries_with_author; _ } = entries_with_author
let entries_with_date { entries_with_date; _ } = entries_with_date
let entries_with_content { entries_with_content; _ } = entries_with_content
let entries_with_tags { entries_with_tags; _ } = entries_with_tags
let avg_content_length { avg_content_length; _ } = avg_content_length
let min_content_length { min_content_length; _ } = min_content_length
let max_content_length { max_content_length; _ } = max_content_length
let posting_frequency_days { posting_frequency_days; _ } = posting_frequency_days
let quality_score { quality_score; _ } = quality_score
1171
+
(** Get content length from an Atom entry *)
1172
+
let get_content_length (entry : Syndic.Atom.entry) =
1173
+
match entry.content with
1174
+
| Some (Syndic.Atom.Text s) -> String.length s
1175
+
| Some (Syndic.Atom.Html (_, s)) -> String.length s
1176
+
| Some (Syndic.Atom.Xhtml (_, _)) -> 0 (* Could calculate but complex *)
1177
+
| Some (Syndic.Atom.Mime _) -> 0
1178
+
| Some (Syndic.Atom.Src _) -> 0
1180
+
match entry.summary with
1181
+
| Some (Syndic.Atom.Text s) -> String.length s
1182
+
| Some (Syndic.Atom.Html (_, s)) -> String.length s
1183
+
| Some (Syndic.Atom.Xhtml (_, _)) -> 0
1186
+
(** Check if entry has non-empty summary *)
1187
+
let has_summary (entry : Syndic.Atom.entry) =
1188
+
match entry.summary with
1189
+
| Some (Syndic.Atom.Text s) when String.trim s <> "" -> true
1190
+
| Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> true
1191
+
| Some (Syndic.Atom.Xhtml (_, _)) -> true
1194
+
(** Check if entry has author *)
1195
+
(** [has_author entry] is [true] when the entry's primary author has a
    non-blank name. Only the first author of the pair is consulted. *)
let has_author (entry : Syndic.Atom.entry) =
  let primary, _contributors = entry.authors in
  not (String.equal (String.trim primary.name) "")
1199
+
(** Check if entry has content *)
1200
+
(** [has_content entry] is [true] when the entry carries any measurable
    content, as reported by [get_content_length]. *)
let has_content (entry : Syndic.Atom.entry) = 0 < get_content_length entry
1203
+
(** Check if entry has tags/categories *)
1204
+
(** [has_tags entry] is [true] when the entry lists at least one category. *)
let has_tags (entry : Syndic.Atom.entry) =
  match entry.categories with [] -> false | _ :: _ -> true
1207
+
(** Calculate quality score from metrics *)
1208
+
let calculate_quality_score t =
1209
+
let total = float_of_int t.total_entries in
1210
+
if total = 0.0 then 0.0
1212
+
let summary_pct = float_of_int t.entries_with_summary /. total *. 100.0 in
1213
+
let author_pct = float_of_int t.entries_with_author /. total *. 100.0 in
1214
+
let date_pct = float_of_int t.entries_with_date /. total *. 100.0 in
1215
+
let content_pct = float_of_int t.entries_with_content /. total *. 100.0 in
1216
+
let tags_pct = float_of_int t.entries_with_tags /. total *. 100.0 in
1218
+
(* Weighted average: content and dates are most important *)
1220
+
(content_pct *. 0.30) +.
1221
+
(date_pct *. 0.25) +.
1222
+
(author_pct *. 0.20) +.
1223
+
(summary_pct *. 0.15) +.
1224
+
(tags_pct *. 0.10)
1228
+
let analyze entries =
1229
+
if entries = [] then
1230
+
failwith "No entries to analyze"
1232
+
let total_entries = List.length entries in
1234
+
let entries_with_summary = ref 0 in
1235
+
let entries_with_author = ref 0 in
1236
+
let entries_with_date = ref total_entries in (* All Atom entries have updated *)
1237
+
let entries_with_content = ref 0 in
1238
+
let entries_with_tags = ref 0 in
1239
+
let content_lengths = ref [] in
1240
+
let dates = ref [] in
1242
+
List.iter (fun (entry : Syndic.Atom.entry) ->
1243
+
if has_summary entry then incr entries_with_summary;
1244
+
if has_author entry then incr entries_with_author;
1245
+
if has_content entry then begin
1246
+
incr entries_with_content;
1247
+
content_lengths := get_content_length entry :: !content_lengths
1249
+
if has_tags entry then incr entries_with_tags;
1250
+
dates := entry.updated :: !dates
1253
+
(* Calculate content statistics *)
1254
+
let avg_content_length, min_content_length, max_content_length =
1255
+
if !content_lengths = [] then
1258
+
let sorted = List.sort compare !content_lengths in
1259
+
let sum = List.fold_left (+) 0 sorted in
1260
+
let avg = float_of_int sum /. float_of_int (List.length sorted) in
1261
+
let min_len = List.hd sorted in
1262
+
let max_len = List.hd (List.rev sorted) in
1263
+
(avg, min_len, max_len)
1266
+
(* Calculate posting frequency *)
1267
+
let posting_frequency_days =
1268
+
if List.length !dates < 2 then
1272
+
let timestamps = List.map Ptime.to_float_s !dates in
1273
+
let sorted_timestamps = List.sort compare timestamps in
1274
+
let first = List.hd sorted_timestamps in
1275
+
let last = List.hd (List.rev sorted_timestamps) in
1276
+
let total_days = (last -. first) /. 86400.0 in
1277
+
let num_intervals = float_of_int (List.length sorted_timestamps - 1) in
1278
+
Some (total_days /. num_intervals)
1282
+
(* Create metrics record (without quality_score first) *)
1285
+
entries_with_summary = !entries_with_summary;
1286
+
entries_with_author = !entries_with_author;
1287
+
entries_with_date = !entries_with_date;
1288
+
entries_with_content = !entries_with_content;
1289
+
entries_with_tags = !entries_with_tags;
1290
+
avg_content_length;
1291
+
min_content_length;
1292
+
max_content_length;
1293
+
posting_frequency_days;
1294
+
quality_score = 0.0; (* Placeholder *)
1297
+
(* Calculate quality score *)
1298
+
let quality_score = calculate_quality_score metrics in
1299
+
{ metrics with quality_score }
1302
+
(** {1 State Management} *)
1304
+
module State = struct
1309
+
module Paths = struct
1310
+
(** Directory holding per-user JSON records. *)
let users_dir state = Eio.Path.(Xdge.state_dir state.xdg / "users")

(** Directory holding generated feed files. *)
let feeds_dir state = Eio.Path.(Xdge.state_dir state.xdg / "feeds")

(** Directory holding per-user Atom feed files. *)
let user_feeds_dir state = Eio.Path.(feeds_dir state / "user")

(** JSON file storing [username]'s record. *)
let user_file state username =
  let basename = username ^ ".json" in
  Eio.Path.(users_dir state / basename)

(** Atom XML file storing [username]'s aggregated feed. *)
let user_feed_file state username =
  let basename = username ^ ".xml" in
  Eio.Path.(user_feeds_dir state / basename)
1327
+
(** Ensure all necessary directories exist *)
1328
+
let ensure_directories state =
1332
+
user_feeds_dir state;
1334
+
List.iter (fun dir ->
1335
+
try Eio.Path.mkdir ~perm:0o755 dir
1336
+
with Eio.Io (Eio.Fs.E (Already_exists _), _) -> ()
1340
+
module Json = struct
1341
+
(** Decode a user from JSON string *)
1342
+
let user_of_string s =
1343
+
match Jsont_bytesrw.decode_string' User.jsont s with
1344
+
| Ok user -> Some user
1346
+
Log.err (fun m -> m "Failed to parse user JSON: %s" (Jsont.Error.to_string err));
1349
+
(** Encode a user to JSON string *)
1350
+
let user_to_string user =
1351
+
match Jsont_bytesrw.encode_string' ~format:Jsont.Indent User.jsont user with
1353
+
| Error err -> failwith ("Failed to encode user: " ^ Jsont.Error.to_string err)
1356
+
module Storage = struct
1357
+
(** Load a user from disk *)
1358
+
let load_user state username =
1359
+
let file = Paths.user_file state username in
1361
+
let content = Eio.Path.load file in
1362
+
Json.user_of_string content
1364
+
| Eio.Io (Eio.Fs.E (Not_found _), _) -> None
1366
+
Log.err (fun m -> m "Error loading user %s: %s" username (Printexc.to_string e));
1369
+
(** Save a user to disk *)
1370
+
(** Serialize [user] to JSON and write it to the user's record file,
    creating or truncating it as needed (mode 0o644). *)
let save_user state user =
  let target = Paths.user_file state (User.username user) in
  Eio.Path.save ~create:(`Or_truncate 0o644) target (Json.user_to_string user)
1375
+
(** List all usernames *)
1376
+
let list_users state =
1378
+
Eio.Path.read_dir (Paths.users_dir state)
1379
+
|> List.filter_map (fun name ->
1380
+
if Filename.check_suffix name ".json" then
1381
+
Some (Filename.chop_suffix name ".json")
1386
+
(** Load existing Atom entries for a user *)
1387
+
let load_existing_posts state username =
1388
+
let file = Paths.user_feed_file state username in
1390
+
let content = Eio.Path.load file in
1391
+
(* Parse existing Atom feed *)
1392
+
let input = Xmlm.make_input (`String (0, content)) in
1393
+
let feed = Syndic.Atom.parse input in
1394
+
feed.Syndic.Atom.entries
1396
+
| Eio.Io (Eio.Fs.E (Not_found _), _) -> []
1398
+
Log.err (fun m -> m "Error loading existing posts for %s: %s"
1399
+
username (Printexc.to_string e));
1402
+
(** Save Atom entries for a user *)
1403
+
(** Build an Atom feed titled after [username] from [entries] and write
    its XML serialization to the user's feed file (mode 0o644). *)
let save_atom_feed state username entries =
  let target = Paths.user_feed_file state username in
  let xml =
    entries
    |> Format.Atom.feed_of_entries ~title:username
    |> Format.Atom.to_string
  in
  Eio.Path.save ~create:(`Or_truncate 0o644) target xml
1409
+
(** Delete a user and their feed file *)
1410
+
(** Delete [username]'s JSON record and Atom feed file.

    Best-effort and idempotent: a file that is already absent is silently
    ignored, but any other I/O failure (permissions, disk errors, ...) is
    logged instead of being swallowed, so real problems stay visible.
    Never raises. *)
let delete_user state username =
  (* Unlink one path, distinguishing "already gone" from genuine errors. *)
  let try_unlink path =
    try Eio.Path.unlink path with
    | Eio.Io (Eio.Fs.E (Not_found _), _) -> ()
    | exn ->
        Log.warn (fun m ->
            m "Could not delete %a for user %s: %s" Eio.Path.pp path username
              (Printexc.to_string exn))
  in
  try_unlink (Paths.user_file state username);
  try_unlink (Paths.user_feed_file state username)
1417
+
module Sync = struct
1418
+
(** Merge new entries with existing ones, updating matching IDs *)
1419
+
let merge_entries ~existing ~new_entries =
1420
+
(* Create a map of new entry IDs for efficient lookup and updates *)
1421
+
let module UriMap = Map.Make(Uri) in
1422
+
let new_entries_map =
1423
+
List.fold_left (fun acc (entry : Syndic.Atom.entry) ->
1424
+
UriMap.add entry.id entry acc
1425
+
) UriMap.empty new_entries
1428
+
(* Update existing entries with new ones if IDs match, otherwise keep existing *)
1429
+
let updated_existing =
1430
+
List.filter_map (fun (entry : Syndic.Atom.entry) ->
1431
+
if UriMap.mem entry.id new_entries_map then
1432
+
None (* Will be replaced by new entry *)
1434
+
Some entry (* Keep existing entry *)
1438
+
(* Combine new entries with non-replaced existing entries *)
1439
+
let combined = new_entries @ updated_existing in
1440
+
List.sort (fun (a : Syndic.Atom.entry) (b : Syndic.Atom.entry) ->
1441
+
Ptime.compare b.updated a.updated
1444
+
(** Current UTC time rendered as an ISO 8601 / RFC 3339 string,
    e.g. ["2024-01-31T12:00:00Z"]. *)
let current_timestamp () =
  let now = Unix.gmtime (Unix.time ()) in
  Printf.sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ"
    (now.Unix.tm_year + 1900)
    (now.Unix.tm_mon + 1)
    now.Unix.tm_mday now.Unix.tm_hour now.Unix.tm_min now.Unix.tm_sec
1452
+
(** Sync feeds for a single user *)
1453
+
let sync_user session state ~username =
1454
+
match Storage.load_user state username with
1456
+
Error (Printf.sprintf "User %s not found" username)
1457
+
| Some user when User.feeds user = [] ->
1458
+
Log.info (fun m -> m "No feeds configured for user %s" username);
1461
+
Log.info (fun m -> m "Syncing feeds for user %s..." username);
1463
+
(* Fetch all feeds concurrently *)
1464
+
let fetched_feeds =
1465
+
Eio.Fiber.List.filter_map (fun source ->
1467
+
Log.info (fun m -> m " Fetching %s (%s)..."
1468
+
(Source.name source) (Source.url source));
1469
+
Some (Feed.fetch session source)
1471
+
Log.err (fun m -> m " Failed to fetch %s: %s"
1472
+
(Source.name source) (Printexc.to_string e));
1474
+
) (User.feeds user)
1477
+
if fetched_feeds = [] then begin
1478
+
Error "No feeds successfully fetched"
1480
+
(* Get posts from fetched feeds *)
1481
+
let posts = Post.of_feeds fetched_feeds in
1482
+
Log.info (fun m -> m " Found %d new posts" (List.length posts));
1484
+
(* Convert to Atom entries *)
1485
+
let new_entries = Format.Atom.entries_of_posts posts in
1487
+
(* Load existing entries *)
1488
+
let existing = Storage.load_existing_posts state username in
1489
+
Log.info (fun m -> m " Found %d existing posts" (List.length existing));
1491
+
(* Merge entries *)
1492
+
let merged = merge_entries ~existing ~new_entries in
1493
+
Log.info (fun m -> m " Total posts after merge: %d" (List.length merged));
1495
+
(* Save updated feed *)
1496
+
Storage.save_atom_feed state username merged;
1498
+
(* Update last_synced timestamp *)
1499
+
let now = current_timestamp () in
1500
+
let user = User.set_last_synced user now in
1501
+
Storage.save_user state user;
1503
+
Log.info (fun m -> m "Sync completed for user %s" username);
1508
+
module Export = struct
1509
+
(** Convert Atom entry to JSONFeed item *)
1510
+
let atom_entry_to_jsonfeed_item (entry : Syndic.Atom.entry) =
1512
+
let id = Uri.to_string entry.id in
1514
+
(* Extract title *)
1516
+
match entry.title with
1517
+
| Syndic.Atom.Text s -> Some s
1518
+
| Syndic.Atom.Html (_, s) -> Some s
1519
+
| Syndic.Atom.Xhtml (_, _) -> Some "Untitled"
1524
+
match entry.links with
1525
+
| link :: _ -> Some (Uri.to_string link.href)
1529
+
(* Extract content *)
1531
+
match entry.content with
1532
+
| Some (Syndic.Atom.Text s) -> `Text s
1533
+
| Some (Syndic.Atom.Html (_, s)) -> `Html s
1534
+
| Some (Syndic.Atom.Xhtml (_, nodes)) ->
1535
+
let html = String.concat "" (List.map Syndic.XML.to_string nodes) in
1537
+
| Some (Syndic.Atom.Mime _) | Some (Syndic.Atom.Src _) | None ->
1541
+
(* Extract summary *)
1543
+
match entry.summary with
1544
+
| Some (Syndic.Atom.Text s) when String.trim s <> "" -> Some s
1545
+
| Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> Some s
1549
+
(* Extract authors *)
1551
+
let (author, contributors) = entry.authors in
1552
+
let author_list = author :: contributors in
1553
+
let jsonfeed_authors = List.filter_map (fun (a : Syndic.Atom.author) ->
1554
+
let name = String.trim a.name in
1555
+
if name = "" then None
1556
+
else Some (Jsonfeed.Author.create ~name ())
1558
+
if jsonfeed_authors = [] then None else Some jsonfeed_authors
1561
+
(* Extract tags *)
1563
+
match entry.categories with
1566
+
let tag_list = List.map (fun (c : Syndic.Atom.category) ->
1567
+
match c.label with
1571
+
if tag_list = [] then None else Some tag_list
1574
+
(* Create JSONFeed item *)
1575
+
Jsonfeed.Item.create
1583
+
~date_published:entry.updated
1586
+
(** Export entries as JSONFeed *)
1587
+
(** Serialize [entries] as a pretty-printed JSONFeed document titled
    [title]. Returns [Ok json] or [Error msg] describing the failure. *)
let export_jsonfeed ~title entries =
  let feed =
    Jsonfeed.create ~title
      ~items:(List.map atom_entry_to_jsonfeed_item entries)
      ()
  in
  Jsonfeed.to_string ~minify:false feed
  |> Result.map_error (fun err ->
         Printf.sprintf "Failed to serialize JSON Feed: %s"
           (Jsont.Error.to_string err))
1595
+
let create env ~app_name =
1596
+
let xdg = Xdge.create env#fs app_name in
1597
+
let state = { xdg } in
1598
+
Paths.ensure_directories state;
1601
+
let create_user state user =
1602
+
match Storage.load_user state (User.username user) with
1604
+
Error (Printf.sprintf "User %s already exists" (User.username user))
1606
+
Storage.save_user state user;
1607
+
Log.info (fun m -> m "User %s created" (User.username user));
1610
+
let delete_user state ~username =
1611
+
match Storage.load_user state username with
1613
+
Error (Printf.sprintf "User %s not found" username)
1615
+
Storage.delete_user state username;
1616
+
Log.info (fun m -> m "User %s deleted" username);
1619
+
(** Look up [username]'s stored record, if any. *)
let get_user state ~username = Storage.load_user state username
1622
+
let update_user state user =
1623
+
match Storage.load_user state (User.username user) with
1625
+
Error (Printf.sprintf "User %s not found" (User.username user))
1627
+
Storage.save_user state user;
1628
+
Log.info (fun m -> m "User %s updated" (User.username user));
1631
+
(** All usernames currently known to the state store. *)
let list_users state = Storage.list_users state
1634
+
(** Sync [username]'s feeds inside a freshly created session. *)
let sync_user env state ~username =
  Session.with_session env (fun session ->
      Sync.sync_user session state ~username)
1638
+
let sync_all env state =
1639
+
let users = Storage.list_users state in
1640
+
if users = [] then begin
1641
+
Log.info (fun m -> m "No users to sync");
1644
+
Log.info (fun m -> m "Syncing %d users concurrently..." (List.length users));
1646
+
Session.with_session env @@ fun session ->
1648
+
Eio.Fiber.List.map (fun username ->
1649
+
match Sync.sync_user session state ~username with
1652
+
Log.err (fun m -> m "Failed to sync user %s: %s" username err);
1656
+
let success_count = List.length (List.filter (fun x -> x) results) in
1657
+
let fail_count = List.length users - success_count in
1659
+
if fail_count = 0 then
1660
+
Log.info (fun m -> m "All users synced successfully");
1662
+
Ok (success_count, fail_count)
1665
+
let get_user_posts state ~username ?limit () =
1666
+
let entries = Storage.load_existing_posts state username in
1669
+
| Some n -> List.filteri (fun i _ -> i < n) entries
1671
+
let get_all_posts state ?limit () =
1672
+
let users = Storage.list_users state in
1674
+
(* Collect all entries from all users with username tag *)
1676
+
List.concat_map (fun username ->
1677
+
let entries = Storage.load_existing_posts state username in
1678
+
List.map (fun entry -> (username, entry)) entries
1682
+
(* Sort by date (newest first) *)
1683
+
let sorted = List.sort (fun (_, a : string * Syndic.Atom.entry) (_, b) ->
1684
+
Ptime.compare b.updated a.updated
1689
+
| Some n -> List.filteri (fun i _ -> i < n) sorted
1691
+
let export_merged_feed state ~title ~format ?limit () =
1692
+
let all_posts = get_all_posts state ?limit () in
1693
+
let entries = List.map snd all_posts in
1697
+
let xml = Format.Atom.to_string (Format.Atom.feed_of_entries ~title entries) in
1700
+
if entries = [] then
1701
+
(* Empty JSONFeed *)
1702
+
let feed = Jsonfeed.create ~title ~items:[] () in
1703
+
match Jsonfeed.to_string ~minify:false feed with
1704
+
| Ok json -> Ok json
1705
+
| Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err))
1707
+
Export.export_jsonfeed ~title entries
1709
+
let analyze_user_quality state ~username =
1710
+
match Storage.load_user state username with
1712
+
Error (Printf.sprintf "User %s not found" username)
1714
+
let entries = Storage.load_existing_posts state username in
1715
+
if entries = [] then
1716
+
Error "No entries to analyze"
1718
+
Ok (Quality.analyze entries)