(** HTML to Markdown converter using Lambda Soup *) (** Extract all links from HTML content *) let extract_links html_str = try let soup = Soup.parse html_str in let links = Soup.select "a[href]" soup in Soup.fold (fun acc link -> match Soup.attribute "href" link with | Some href -> let text = Soup.texts link |> String.concat "" |> String.trim in (href, text) :: acc | None -> acc ) [] links |> List.rev with _ -> [] (** Check if string contains any whitespace *) let has_whitespace s = try let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in true with Not_found -> false (** Clean up excessive newlines and normalize spacing *) let cleanup_markdown s = (* Normalize line endings *) let s = Str.global_replace (Str.regexp "\r\n") "\n" s in (* Remove trailing whitespace from each line *) let lines = String.split_on_char '\n' s in let lines = List.map (fun line -> (* Trim trailing spaces but preserve leading spaces for indentation *) let len = String.length line in let rec find_last_non_space i = if i < 0 then -1 else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1) else i in let last = find_last_non_space (len - 1) in if last < 0 then "" else String.sub line 0 (last + 1) ) lines in (* Join back and collapse excessive blank lines *) let s = String.concat "\n" lines in (* Replace 3+ consecutive newlines with just 2 *) let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in (* Trim leading and trailing whitespace *) String.trim s (** Convert HTML to Markdown using state-based whitespace handling *) let html_to_markdown html_str = try let soup = Soup.parse html_str in let buffer = Buffer.create 256 in (* State: track if we need to insert a space before next text *) let need_space = ref false in (* Get last character in buffer, if any *) let last_char () = let len = Buffer.length buffer in if len = 0 then None else Some (Buffer.nth buffer (len - 1)) in (* Add text with proper spacing *) let add_text text = let trimmed = String.trim text in if trimmed <> "" then begin (* Check if text starts with punctuation that shouldn't have space before it *) let starts_with_punctuation = String.length trimmed > 0 && (match trimmed.[0] with | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true | _ -> false) in (* Add space if needed, unless we're before punctuation *) if !need_space && not starts_with_punctuation then begin match last_char () with | Some (' ' | '\n') -> () | _ -> Buffer.add_char buffer ' ' end; Buffer.add_string buffer trimmed; need_space := false end in (* Mark that we need space before next text (for inline elements) *) let mark_space_needed () = need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0 in (* Process header with ID/anchor handling *) let process_header level elem = need_space := false; (* Check if header contains a link with an ID fragment *) let link_opt = Soup.select_one "a[href]" elem in let anchor_id = match link_opt with | Some link -> (match Soup.attribute "href" link with | Some href -> (* Extract fragment from URL *) let uri = Uri.of_string href in Uri.fragment uri | None -> None) | None -> None in (* Add anchor if we found an ID *) (match anchor_id with | Some id when id <> "" -> Buffer.add_string buffer (Printf.sprintf "\n\n" id) | _ -> ()); (* Add the header marker *) let marker = String.make level '#' in Buffer.add_string buffer ("\n" ^ marker ^ " "); (* Get text content, excluding link tags *) let text = Soup.texts elem |> String.concat " " |> String.trim in Buffer.add_string buffer text; Buffer.add_string buffer "\n\n"; need_space := false in let rec process_node node = match Soup.element node with | Some elem -> let tag = Soup.name elem in (match tag with (* Block elements - reset space tracking *) | "h1" -> process_header 1 elem | "h2" -> process_header 2 elem | "h3" -> process_header 3 elem | "h4" -> process_header 4 elem | "h5" -> process_header 5 elem | "h6" -> process_header 6 elem | "p" -> need_space := false; Soup.children elem |> Soup.iter process_node; Buffer.add_string buffer "\n\n"; need_space := false | "br" -> Buffer.add_string buffer "\n"; need_space := false (* Inline elements - preserve space tracking *) | "strong" | "b" -> (* Add space before if needed *) if !need_space then begin match last_char () with | Some (' ' | '\n') -> () | _ -> Buffer.add_char buffer ' ' end; Buffer.add_string buffer "**"; need_space := false; Soup.children elem |> Soup.iter process_node; Buffer.add_string buffer "**"; mark_space_needed () | "em" | "i" -> (* Add space before if needed *) if !need_space then begin match last_char () with | Some (' ' | '\n') -> () | _ -> Buffer.add_char buffer ' ' end; Buffer.add_string buffer "*"; need_space := false; Soup.children elem |> Soup.iter process_node; Buffer.add_string buffer "*"; mark_space_needed () | "code" -> (* Add space before if needed *) if !need_space then begin match last_char () with | Some (' ' | '\n') -> () | _ -> Buffer.add_char buffer ' ' end; Buffer.add_string buffer "`"; need_space := false; Soup.children elem |> Soup.iter process_node; Buffer.add_string buffer "`"; mark_space_needed () | "pre" -> need_space := false; Buffer.add_string buffer "\n```\n"; Soup.children elem |> Soup.iter process_node; Buffer.add_string buffer "\n```\n\n"; need_space := false | "a" -> let text = Soup.texts elem |> String.concat " " |> String.trim in let href = Soup.attribute "href" elem in (match href with | Some href -> (* Add space before link if needed *) if !need_space then begin match last_char () with | Some (' ' | '\n') -> () | _ -> Buffer.add_char buffer ' ' end; need_space := false; (* Add the link markdown *) if text = "" then Buffer.add_string buffer (Printf.sprintf "<%s>" href) else Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href); (* Mark that space may be needed after link *) mark_space_needed () | None -> add_text text) | "ul" | "ol" -> need_space := false; Buffer.add_string buffer "\n"; let is_ordered = tag = "ol" in let items = Soup.children elem |> Soup.to_list in List.iteri (fun i item -> match Soup.element item with | Some li when Soup.name li = "li" -> need_space := false; if is_ordered then Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1)) else Buffer.add_string buffer "- "; Soup.children li |> Soup.iter process_node; Buffer.add_string buffer "\n" | _ -> () ) items; Buffer.add_string buffer "\n"; need_space := false | "blockquote" -> need_space := false; Buffer.add_string buffer "\n> "; Soup.children elem |> Soup.iter process_node; Buffer.add_string buffer "\n\n"; need_space := false | "img" -> (* Add space before if needed *) if !need_space then begin match last_char () with | Some (' ' | '\n') -> () | _ -> Buffer.add_char buffer ' ' end; let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in let src = Soup.attribute "src" elem |> Option.value ~default:"" in Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src); need_space := false; mark_space_needed () | "hr" -> need_space := false; Buffer.add_string buffer "\n---\n\n"; need_space := false (* Strip these tags but keep content *) | "div" | "span" | "article" | "section" | "header" | "footer" | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" -> Soup.children elem |> Soup.iter process_node (* Ignore script, style, etc *) | "script" | "style" | "noscript" -> () (* Default: just process children *) | _ -> Soup.children elem |> Soup.iter process_node) | None -> (* Text node - handle whitespace properly *) match Soup.leaf_text node with | Some text -> (* If text is only whitespace, mark that we need space *) let trimmed = String.trim text in if trimmed = "" then begin if has_whitespace text then need_space := true end else begin (* Text has content - check if it had leading/trailing whitespace *) let had_leading_ws = has_whitespace text && (String.length text > 0 && (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in (* If had leading whitespace, mark we need space *) if had_leading_ws then need_space := true; (* Add the text content *) add_text trimmed; (* If had trailing whitespace, mark we need space for next *) let had_trailing_ws = has_whitespace text && (String.length text > 0 && let last = text.[String.length text - 1] in last = ' ' || last = '\t' || last = '\n' || last = '\r') in if had_trailing_ws then need_space := true end | None -> () in Soup.children soup |> Soup.iter process_node; (* Clean up the result *) let result = Buffer.contents buffer in cleanup_markdown result with _ -> html_str (** Convert HTML content to clean Markdown *) let to_markdown html_str = html_to_markdown html_str