(** HTML to Markdown converter using Lambda Soup *)
(** Extract all links from HTML content *)
let extract_links html_str =
try
let soup = Soup.parse html_str in
let links = Soup.select "a[href]" soup in
Soup.fold (fun acc link ->
match Soup.attribute "href" link with
| Some href ->
let text = Soup.texts link |> String.concat "" |> String.trim in
(href, text) :: acc
| None -> acc
) [] links
|> List.rev
with _ -> []
(** Check if string contains any whitespace *)
let has_whitespace s =
try
let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
true
with Not_found -> false
(** Clean up excessive newlines and normalize spacing *)
let cleanup_markdown s =
(* Normalize line endings *)
let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
(* Remove trailing whitespace from each line *)
let lines = String.split_on_char '\n' s in
let lines = List.map (fun line ->
(* Trim trailing spaces but preserve leading spaces for indentation *)
let len = String.length line in
let rec find_last_non_space i =
if i < 0 then -1
else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
else i
in
let last = find_last_non_space (len - 1) in
if last < 0 then ""
else String.sub line 0 (last + 1)
) lines in
(* Join back and collapse excessive blank lines *)
let s = String.concat "\n" lines in
(* Replace 3+ consecutive newlines with just 2 *)
let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
(* Trim leading and trailing whitespace *)
String.trim s
(** Convert HTML to Markdown using state-based whitespace handling *)
let html_to_markdown html_str =
try
let soup = Soup.parse html_str in
let buffer = Buffer.create 256 in
(* State: track if we need to insert a space before next text *)
let need_space = ref false in
(* Get last character in buffer, if any *)
let last_char () =
let len = Buffer.length buffer in
if len = 0 then None
else Some (Buffer.nth buffer (len - 1))
in
(* Add text with proper spacing *)
let add_text text =
let trimmed = String.trim text in
if trimmed <> "" then begin
(* Check if text starts with punctuation that shouldn't have space before it *)
let starts_with_punctuation =
String.length trimmed > 0 &&
(match trimmed.[0] with
| ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
| _ -> false)
in
(* Add space if needed, unless we're before punctuation *)
if !need_space && not starts_with_punctuation then begin
match last_char () with
| Some (' ' | '\n') -> ()
| _ -> Buffer.add_char buffer ' '
end;
Buffer.add_string buffer trimmed;
need_space := false
end
in
(* Mark that we need space before next text (for inline elements) *)
let mark_space_needed () =
need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
in
(* Process header with ID/anchor handling *)
let process_header level elem =
need_space := false;
(* Check if header contains a link with an ID fragment *)
let link_opt = Soup.select_one "a[href]" elem in
let anchor_id = match link_opt with
| Some link ->
(match Soup.attribute "href" link with
| Some href ->
(* Extract fragment from URL *)
let uri = Uri.of_string href in
Uri.fragment uri
| None -> None)
| None -> None
in
(* Add anchor if we found an ID *)
(match anchor_id with
| Some id when id <> "" ->
Buffer.add_string buffer (Printf.sprintf "\n\n" id)
| _ -> ());
(* Add the header marker *)
let marker = String.make level '#' in
Buffer.add_string buffer ("\n" ^ marker ^ " ");
(* Get text content, excluding link tags *)
let text = Soup.texts elem |> String.concat " " |> String.trim in
Buffer.add_string buffer text;
Buffer.add_string buffer "\n\n";
need_space := false
in
let rec process_node node =
match Soup.element node with
| Some elem ->
let tag = Soup.name elem in
(match tag with
(* Block elements - reset space tracking *)
| "h1" -> process_header 1 elem
| "h2" -> process_header 2 elem
| "h3" -> process_header 3 elem
| "h4" -> process_header 4 elem
| "h5" -> process_header 5 elem
| "h6" -> process_header 6 elem
| "p" ->
need_space := false;
Soup.children elem |> Soup.iter process_node;
Buffer.add_string buffer "\n\n";
need_space := false
| "br" ->
Buffer.add_string buffer "\n";
need_space := false
(* Inline elements - preserve space tracking *)
| "strong" | "b" ->
(* Add space before if needed *)
if !need_space then begin
match last_char () with
| Some (' ' | '\n') -> ()
| _ -> Buffer.add_char buffer ' '
end;
Buffer.add_string buffer "**";
need_space := false;
Soup.children elem |> Soup.iter process_node;
Buffer.add_string buffer "**";
mark_space_needed ()
| "em" | "i" ->
(* Add space before if needed *)
if !need_space then begin
match last_char () with
| Some (' ' | '\n') -> ()
| _ -> Buffer.add_char buffer ' '
end;
Buffer.add_string buffer "*";
need_space := false;
Soup.children elem |> Soup.iter process_node;
Buffer.add_string buffer "*";
mark_space_needed ()
| "code" ->
(* Add space before if needed *)
if !need_space then begin
match last_char () with
| Some (' ' | '\n') -> ()
| _ -> Buffer.add_char buffer ' '
end;
Buffer.add_string buffer "`";
need_space := false;
Soup.children elem |> Soup.iter process_node;
Buffer.add_string buffer "`";
mark_space_needed ()
| "pre" ->
need_space := false;
Buffer.add_string buffer "\n```\n";
Soup.children elem |> Soup.iter process_node;
Buffer.add_string buffer "\n```\n\n";
need_space := false
| "a" ->
let text = Soup.texts elem |> String.concat " " |> String.trim in
let href = Soup.attribute "href" elem in
(match href with
| Some href ->
(* Add space before link if needed *)
if !need_space then begin
match last_char () with
| Some (' ' | '\n') -> ()
| _ -> Buffer.add_char buffer ' '
end;
need_space := false;
(* Add the link markdown *)
if text = "" then
Buffer.add_string buffer (Printf.sprintf "<%s>" href)
else
Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
(* Mark that space may be needed after link *)
mark_space_needed ()
| None ->
add_text text)
| "ul" | "ol" ->
need_space := false;
Buffer.add_string buffer "\n";
let is_ordered = tag = "ol" in
let items = Soup.children elem |> Soup.to_list in
List.iteri (fun i item ->
match Soup.element item with
| Some li when Soup.name li = "li" ->
need_space := false;
if is_ordered then
Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
else
Buffer.add_string buffer "- ";
Soup.children li |> Soup.iter process_node;
Buffer.add_string buffer "\n"
| _ -> ()
) items;
Buffer.add_string buffer "\n";
need_space := false
| "blockquote" ->
need_space := false;
Buffer.add_string buffer "\n> ";
Soup.children elem |> Soup.iter process_node;
Buffer.add_string buffer "\n\n";
need_space := false
| "img" ->
(* Add space before if needed *)
if !need_space then begin
match last_char () with
| Some (' ' | '\n') -> ()
| _ -> Buffer.add_char buffer ' '
end;
let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
let src = Soup.attribute "src" elem |> Option.value ~default:"" in
Buffer.add_string buffer (Printf.sprintf "" alt src);
need_space := false;
mark_space_needed ()
| "hr" ->
need_space := false;
Buffer.add_string buffer "\n---\n\n";
need_space := false
(* Strip these tags but keep content *)
| "div" | "span" | "article" | "section" | "header" | "footer"
| "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
Soup.children elem |> Soup.iter process_node
(* Ignore script, style, etc *)
| "script" | "style" | "noscript" -> ()
(* Default: just process children *)
| _ ->
Soup.children elem |> Soup.iter process_node)
| None ->
(* Text node - handle whitespace properly *)
match Soup.leaf_text node with
| Some text ->
(* If text is only whitespace, mark that we need space *)
let trimmed = String.trim text in
if trimmed = "" then begin
if has_whitespace text then
need_space := true
end else begin
(* Text has content - check if it had leading/trailing whitespace *)
let had_leading_ws = has_whitespace text &&
(String.length text > 0 &&
(text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
(* If had leading whitespace, mark we need space *)
if had_leading_ws then need_space := true;
(* Add the text content *)
add_text trimmed;
(* If had trailing whitespace, mark we need space for next *)
let had_trailing_ws = has_whitespace text &&
(String.length text > 0 &&
let last = text.[String.length text - 1] in
last = ' ' || last = '\t' || last = '\n' || last = '\r') in
if had_trailing_ws then need_space := true
end
| None -> ()
in
Soup.children soup |> Soup.iter process_node;
(* Clean up the result *)
let result = Buffer.contents buffer in
cleanup_markdown result
with _ -> html_str
(** Convert HTML content to clean Markdown *)
let to_markdown html_str =
html_to_markdown html_str