My agentic slop goes here. Not intended for anyone else!

more

Changed files
+119 -49
stack
+119 -49
stack/river/lib/markdown_converter.ml
···
|> List.rev
with _ -> []
-
(** Normalize whitespace while preserving boundary spaces *)
-
let normalize_whitespace s =
-
let len = String.length s in
-
if len = 0 then ""
-
else
-
(* Check if original had leading/trailing whitespace *)
-
let has_leading = len > 0 && (s.[0] = ' ' || s.[0] = '\t' || s.[0] = '\n' || s.[0] = '\r') in
-
let has_trailing = len > 0 && (s.[len-1] = ' ' || s.[len-1] = '\t' || s.[len-1] = '\n' || s.[len-1] = '\r') in
-
-
(* Collapse multiple whitespace to single space *)
-
let normalized = Str.global_replace (Str.regexp "[ \t\n\r]+") " " s in
-
let trimmed = String.trim normalized in
-
-
(* If the trimmed result is empty but there was whitespace, return single space *)
-
if trimmed = "" && (has_leading || has_trailing) then " "
-
else
-
(* Add back boundary spaces if they existed *)
-
let result =
-
(if has_leading && trimmed <> "" then " " else "") ^
-
trimmed ^
-
(if has_trailing && trimmed <> "" then " " else "")
-
in
-
result
(** Clean up excessive newlines and normalize spacing *)
let cleanup_markdown s =
···
(* Trim leading and trailing whitespace *)
String.trim s
-
(** Convert HTML to Markdown *)
let html_to_markdown html_str =
try
let soup = Soup.parse html_str in
let buffer = Buffer.create 256 in
let rec process_node node =
match Soup.element node with
| Some elem ->
let tag = Soup.name elem in
(match tag with
| "h1" ->
Buffer.add_string buffer "\n# ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "h2" ->
Buffer.add_string buffer "\n## ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "h3" ->
Buffer.add_string buffer "\n### ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "h4" ->
Buffer.add_string buffer "\n#### ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "h5" ->
Buffer.add_string buffer "\n##### ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "h6" ->
Buffer.add_string buffer "\n###### ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "p" ->
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "br" ->
-
Buffer.add_string buffer "\n"
| "strong" | "b" ->
Buffer.add_string buffer "**";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "**"
| "em" | "i" ->
Buffer.add_string buffer "*";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "*"
| "code" ->
Buffer.add_string buffer "`";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "`"
| "pre" ->
Buffer.add_string buffer "\n```\n";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n```\n\n"
| "a" ->
-
let text = Soup.texts elem |> String.concat "" |> String.trim in
-
(match Soup.attribute "href" elem with
| Some href ->
if text = "" then
Buffer.add_string buffer (Printf.sprintf "<%s>" href)
else
-
Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href)
-
| None -> Buffer.add_string buffer text)
| "ul" | "ol" ->
Buffer.add_string buffer "\n";
let is_ordered = tag = "ol" in
let items = Soup.children elem |> Soup.to_list in
List.iteri (fun i item ->
match Soup.element item with
| Some li when Soup.name li = "li" ->
if is_ordered then
Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
else
···
Buffer.add_string buffer "\n"
| _ -> ()
) items;
-
Buffer.add_string buffer "\n"
| "blockquote" ->
Buffer.add_string buffer "\n> ";
Soup.children elem |> Soup.iter process_node;
-
Buffer.add_string buffer "\n\n"
| "img" ->
let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
let src = Soup.attribute "src" elem |> Option.value ~default:"" in
-
Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src)
| "hr" ->
-
Buffer.add_string buffer "\n---\n\n"
(* Strip these tags but keep content *)
| "div" | "span" | "article" | "section" | "header" | "footer"
-
| "main" | "nav" | "aside" | "figure" | "figcaption" ->
Soup.children elem |> Soup.iter process_node
(* Ignore script, style, etc *)
| "script" | "style" | "noscript" -> ()
···
| _ ->
Soup.children elem |> Soup.iter process_node)
| None ->
-
(* Text node *)
match Soup.leaf_text node with
| Some text ->
-
let normalized = normalize_whitespace text in
-
if normalized <> "" then
-
Buffer.add_string buffer normalized
| None -> ()
in
···
|> List.rev
with _ -> []
+
(** Check if string contains any whitespace *)
+
let has_whitespace s =
+
try
+
let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
+
true
+
with Not_found -> false
(** Clean up excessive newlines and normalize spacing *)
let cleanup_markdown s =
···
(* Trim leading and trailing whitespace *)
String.trim s
+
(** Convert HTML to Markdown using state-based whitespace handling *)
let html_to_markdown html_str =
try
let soup = Soup.parse html_str in
let buffer = Buffer.create 256 in
+
(* State: track if we need to insert a space before next text *)
+
let need_space = ref false in
+
+
(* Get last character in buffer, if any *)
+
let last_char () =
+
let len = Buffer.length buffer in
+
if len = 0 then None
+
else Some (Buffer.nth buffer (len - 1))
+
in
+
+
(* Add text with proper spacing *)
+
let add_text text =
+
let trimmed = String.trim text in
+
if trimmed <> "" then begin
+
(* Add space if needed and last char isn't already whitespace *)
+
if !need_space then begin
+
match last_char () with
+
| Some (' ' | '\n') -> ()
+
| _ -> Buffer.add_char buffer ' '
+
end;
+
Buffer.add_string buffer trimmed;
+
need_space := false
+
end
+
in
+
+
(* Mark that we need space before next text (for inline elements) *)
+
let mark_space_needed () =
+
need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
+
in
+
let rec process_node node =
match Soup.element node with
| Some elem ->
let tag = Soup.name elem in
(match tag with
+
(* Block elements - reset space tracking *)
| "h1" ->
+
need_space := false;
Buffer.add_string buffer "\n# ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "h2" ->
+
need_space := false;
Buffer.add_string buffer "\n## ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "h3" ->
+
need_space := false;
Buffer.add_string buffer "\n### ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "h4" ->
+
need_space := false;
Buffer.add_string buffer "\n#### ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "h5" ->
+
need_space := false;
Buffer.add_string buffer "\n##### ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "h6" ->
+
need_space := false;
Buffer.add_string buffer "\n###### ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "p" ->
+
need_space := false;
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "br" ->
+
Buffer.add_string buffer "\n";
+
need_space := false
+
(* Inline elements - preserve space tracking *)
| "strong" | "b" ->
Buffer.add_string buffer "**";
+
need_space := false;
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "**";
+
mark_space_needed ()
| "em" | "i" ->
Buffer.add_string buffer "*";
+
need_space := false;
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "*";
+
mark_space_needed ()
| "code" ->
Buffer.add_string buffer "`";
+
need_space := false;
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "`";
+
mark_space_needed ()
| "pre" ->
+
need_space := false;
Buffer.add_string buffer "\n```\n";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n```\n\n";
+
need_space := false
| "a" ->
+
let text = Soup.texts elem |> String.concat " " |> String.trim in
+
let href = Soup.attribute "href" elem in
+
(match href with
| Some href ->
if text = "" then
Buffer.add_string buffer (Printf.sprintf "<%s>" href)
else
+
Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
+
mark_space_needed ()
+
| None ->
+
add_text text)
| "ul" | "ol" ->
+
need_space := false;
Buffer.add_string buffer "\n";
let is_ordered = tag = "ol" in
let items = Soup.children elem |> Soup.to_list in
List.iteri (fun i item ->
match Soup.element item with
| Some li when Soup.name li = "li" ->
+
need_space := false;
if is_ordered then
Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
else
···
Buffer.add_string buffer "\n"
| _ -> ()
) items;
+
Buffer.add_string buffer "\n";
+
need_space := false
| "blockquote" ->
+
need_space := false;
Buffer.add_string buffer "\n> ";
Soup.children elem |> Soup.iter process_node;
+
Buffer.add_string buffer "\n\n";
+
need_space := false
| "img" ->
let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
let src = Soup.attribute "src" elem |> Option.value ~default:"" in
+
Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src);
+
mark_space_needed ()
| "hr" ->
+
need_space := false;
+
Buffer.add_string buffer "\n---\n\n";
+
need_space := false
(* Strip these tags but keep content *)
| "div" | "span" | "article" | "section" | "header" | "footer"
+
| "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
Soup.children elem |> Soup.iter process_node
(* Ignore script, style, etc *)
| "script" | "style" | "noscript" -> ()
···
| _ ->
Soup.children elem |> Soup.iter process_node)
| None ->
+
(* Text node - handle whitespace properly *)
match Soup.leaf_text node with
| Some text ->
+
(* If text is only whitespace, mark that we need space *)
+
let trimmed = String.trim text in
+
if trimmed = "" then begin
+
if has_whitespace text then
+
need_space := true
+
end else begin
+
(* Text has content - check if it had leading/trailing whitespace *)
+
let had_leading_ws = has_whitespace text &&
+
(String.length text > 0 &&
+
(text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
+
+
(* If had leading whitespace, mark we need space *)
+
if had_leading_ws then need_space := true;
+
+
(* Add the text content *)
+
add_text trimmed;
+
+
(* If had trailing whitespace, mark we need space for next *)
+
let had_trailing_ws = has_whitespace text &&
+
(String.length text > 0 &&
+
let last = text.[String.length text - 1] in
+
last = ' ' || last = '\t' || last = '\n' || last = '\r') in
+
if had_trailing_ws then need_space := true
+
end
| None -> ()
in