(* * Copyright (c) 2014, OCaml.org project * Copyright (c) 2015 KC Sivaramakrishnan * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *) (** Feed fetching and parsing. *) let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator" module Log = (val Logs.src_log src : Logs.LOG) type feed_content = River_jsonfeed.t type t = { source : Source.t; title : string; content : feed_content; original_format : string; (* "Atom", "RSS2", or "JSONFeed" *) } let classify_feed ~xmlbase (body : string) = Log.debug (fun m -> m "Attempting to parse feed (%d bytes)" (String.length body)); (* Quick check - does it look like JSON? *) let looks_like_json = String.length body > 0 && let first_char = String.get body 0 in first_char = '{' || first_char = '[' in if looks_like_json then ( (* Try JSONFeed first *) Log.debug (fun m -> m "Body looks like JSON, trying JSONFeed parser"); match Jsonfeed.of_string body with | Ok jsonfeed -> Log.debug (fun m -> m "Successfully parsed as JSONFeed"); (* Wrap plain JSONFeed with River_jsonfeed (no extensions needed) *) let river_jsonfeed = { River_jsonfeed.feed = jsonfeed; extension = None } in (river_jsonfeed, "JSONFeed") | Error err -> let err_str = Jsont.Error.to_string err in Log.debug (fun m -> m "Not a JSONFeed: %s" err_str); (* Fall through to XML parsing *) failwith (Printf.sprintf "Not a valid JSONFeed: %s" err_str) ) else ( (* Try XML formats *) try let atom_feed = Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, body))) in Log.debug (fun m -> m "Successfully parsed as Atom feed, converting to JSONFeed"); (* Convert Atom to JSONFeed with extensions *) let river_jsonfeed = River_jsonfeed.of_atom atom_feed in (river_jsonfeed, "Atom") with | Syndic.Atom.Error.Error (pos, msg) -> ( Log.debug (fun m -> m "Not an Atom feed: %s at position (%d, %d)" msg (fst pos) (snd pos)); try let rss2_channel = Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, body))) in Log.debug (fun m -> m "Successfully parsed as RSS2 feed, converting to JSONFeed"); (* Convert RSS2 to JSONFeed *) let river_jsonfeed = River_jsonfeed.of_rss2 rss2_channel in (river_jsonfeed, "RSS2") with Syndic.Rss2.Error.Error (pos, msg) -> Log.err (fun m -> m "Failed to parse as RSS2: %s at position (%d, %d)" msg (fst pos) (snd pos)); failwith "Neither Atom nor RSS2 feed") | Not_found as e -> Log.err (fun m -> m "Not_found exception during Atom feed parsing"); Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ())); raise e | e -> Log.err (fun m -> m "Unexpected exception during feed parsing: %s" (Printexc.to_string e)); Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ())); raise e ) let fetch session source = Log.info (fun m -> m "Fetching feed: %s" (Source.name source)); let xmlbase = Uri.of_string (Source.url source) in (* Use Requests_json_api.get_result for clean Result-based error handling *) let requests_session = Session.get_requests_session session in let response = match Requests_json_api.get_result requests_session (Source.url source) with | Ok body -> Log.info (fun m -> m "Successfully fetched %s (%d bytes)" (Source.url source) (String.length body)); body | Error (status, msg) -> let truncated_msg = if String.length msg > 200 then String.sub msg 0 200 ^ "..." else msg in Log.err (fun m -> m "Failed to fetch feed '%s': HTTP %d - %s" (Source.name source) status truncated_msg); failwith (Printf.sprintf "HTTP %d: %s" status truncated_msg) in let (content, original_format) = try classify_feed ~xmlbase response with Failure msg -> Log.err (fun m -> m "Failed to parse feed '%s' (%s): %s" (Source.name source) (Source.url source) msg); raise (Failure msg) in let title = Jsonfeed.title content.River_jsonfeed.feed in Log.info (fun m -> m "Successfully fetched %s feed '%s' (title: '%s'), converted to JSONFeed" original_format (Source.name source) title); { source; title; content; original_format } let source t = t.source let content t = t.content let title t = t.title