stack/river/lib/markdown_converter.ml at jsont · anil.recoil.org/slop

anil.recoil.org / slop
My agentic slop goes here. Not intended for anyone else!
slop / stack / river / lib / markdown_converter.ml
at jsont 12 kB view raw
  1(** HTML to Markdown converter using Lambda Soup *)
  2
  3(** Extract all links from HTML content *)
  4let extract_links html_str =
  5  try
  6    let soup = Soup.parse html_str in
  7    let links = Soup.select "a[href]" soup in
  8    Soup.fold (fun acc link ->
  9      match Soup.attribute "href" link with
 10      | Some href ->
 11          let text = Soup.texts link |> String.concat "" |> String.trim in
 12          (href, text) :: acc
 13      | None -> acc
 14    ) [] links
 15    |> List.rev
 16  with _ -> []
 17
 18(** Check if string contains any whitespace *)
 19let has_whitespace s =
 20  try
 21    let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
 22    true
 23  with Not_found -> false
 24
 25(** Clean up excessive newlines and normalize spacing *)
 26let cleanup_markdown s =
 27  (* Normalize line endings *)
 28  let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
 29
 30  (* Remove trailing whitespace from each line *)
 31  let lines = String.split_on_char '\n' s in
 32  let lines = List.map (fun line ->
 33    (* Trim trailing spaces but preserve leading spaces for indentation *)
 34    let len = String.length line in
 35    let rec find_last_non_space i =
 36      if i < 0 then -1
 37      else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
 38      else i
 39    in
 40    let last = find_last_non_space (len - 1) in
 41    if last < 0 then ""
 42    else String.sub line 0 (last + 1)
 43  ) lines in
 44
 45  (* Join back and collapse excessive blank lines *)
 46  let s = String.concat "\n" lines in
 47
 48  (* Replace 3+ consecutive newlines with just 2 *)
 49  let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
 50
 51  (* Trim leading and trailing whitespace *)
 52  String.trim s
 53
 54(** Convert HTML to Markdown using state-based whitespace handling *)
 55let html_to_markdown html_str =
 56  try
 57    let soup = Soup.parse html_str in
 58    let buffer = Buffer.create 256 in
 59
 60    (* State: track if we need to insert a space before next text *)
 61    let need_space = ref false in
 62
 63    (* Get last character in buffer, if any *)
 64    let last_char () =
 65      let len = Buffer.length buffer in
 66      if len = 0 then None
 67      else Some (Buffer.nth buffer (len - 1))
 68    in
 69
 70    (* Add text with proper spacing *)
 71    let add_text text =
 72      let trimmed = String.trim text in
 73      if trimmed <> "" then begin
 74        (* Check if text starts with punctuation that shouldn't have space before it *)
 75        let starts_with_punctuation =
 76          String.length trimmed > 0 &&
 77          (match trimmed.[0] with
 78          | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
 79          | _ -> false)
 80        in
 81
 82        (* Add space if needed, unless we're before punctuation *)
 83        if !need_space && not starts_with_punctuation then begin
 84          match last_char () with
 85          | Some (' ' | '\n') -> ()
 86          | _ -> Buffer.add_char buffer ' '
 87        end;
 88        Buffer.add_string buffer trimmed;
 89        need_space := false
 90      end
 91    in
 92
 93    (* Mark that we need space before next text (for inline elements) *)
 94    let mark_space_needed () =
 95      need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
 96    in
 97
 98    (* Process header with ID/anchor handling *)
 99    let process_header level elem =
100      need_space := false;
101
102      (* Check if header contains a link with an ID fragment *)
103      let link_opt = Soup.select_one "a[href]" elem in
104      let anchor_id = match link_opt with
105        | Some link ->
106            (match Soup.attribute "href" link with
107            | Some href ->
108                (* Extract fragment from URL *)
109                let uri = Uri.of_string href in
110                Uri.fragment uri
111            | None -> None)
112        | None -> None
113      in
114
115      (* Add anchor if we found an ID *)
116      (match anchor_id with
117      | Some id when id <> "" ->
118          Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
119      | _ -> ());
120
121      (* Add the header marker *)
122      let marker = String.make level '#' in
123      Buffer.add_string buffer ("\n" ^ marker ^ " ");
124
125      (* Get text content, excluding link tags *)
126      let text = Soup.texts elem |> String.concat " " |> String.trim in
127      Buffer.add_string buffer text;
128
129      Buffer.add_string buffer "\n\n";
130      need_space := false
131    in
132
133    let rec process_node node =
134      match Soup.element node with
135      | Some elem ->
136          let tag = Soup.name elem in
137          (match tag with
138          (* Block elements - reset space tracking *)
139          | "h1" -> process_header 1 elem
140          | "h2" -> process_header 2 elem
141          | "h3" -> process_header 3 elem
142          | "h4" -> process_header 4 elem
143          | "h5" -> process_header 5 elem
144          | "h6" -> process_header 6 elem
145          | "p" ->
146              need_space := false;
147              Soup.children elem |> Soup.iter process_node;
148              Buffer.add_string buffer "\n\n";
149              need_space := false
150          | "br" ->
151              Buffer.add_string buffer "\n";
152              need_space := false
153          (* Inline elements - preserve space tracking *)
154          | "strong" | "b" ->
155              (* Add space before if needed *)
156              if !need_space then begin
157                match last_char () with
158                | Some (' ' | '\n') -> ()
159                | _ -> Buffer.add_char buffer ' '
160              end;
161              Buffer.add_string buffer "**";
162              need_space := false;
163              Soup.children elem |> Soup.iter process_node;
164              Buffer.add_string buffer "**";
165              mark_space_needed ()
166          | "em" | "i" ->
167              (* Add space before if needed *)
168              if !need_space then begin
169                match last_char () with
170                | Some (' ' | '\n') -> ()
171                | _ -> Buffer.add_char buffer ' '
172              end;
173              Buffer.add_string buffer "*";
174              need_space := false;
175              Soup.children elem |> Soup.iter process_node;
176              Buffer.add_string buffer "*";
177              mark_space_needed ()
178          | "code" ->
179              (* Add space before if needed *)
180              if !need_space then begin
181                match last_char () with
182                | Some (' ' | '\n') -> ()
183                | _ -> Buffer.add_char buffer ' '
184              end;
185              Buffer.add_string buffer "`";
186              need_space := false;
187              Soup.children elem |> Soup.iter process_node;
188              Buffer.add_string buffer "`";
189              mark_space_needed ()
190          | "pre" ->
191              need_space := false;
192              Buffer.add_string buffer "\n```\n";
193              Soup.children elem |> Soup.iter process_node;
194              Buffer.add_string buffer "\n```\n\n";
195              need_space := false
196          | "a" ->
197              let text = Soup.texts elem |> String.concat " " |> String.trim in
198              let href = Soup.attribute "href" elem in
199              (match href with
200              | Some href ->
201                  (* Add space before link if needed *)
202                  if !need_space then begin
203                    match last_char () with
204                    | Some (' ' | '\n') -> ()
205                    | _ -> Buffer.add_char buffer ' '
206                  end;
207                  need_space := false;
208
209                  (* Add the link markdown *)
210                  if text = "" then
211                    Buffer.add_string buffer (Printf.sprintf "<%s>" href)
212                  else
213                    Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
214
215                  (* Mark that space may be needed after link *)
216                  mark_space_needed ()
217              | None ->
218                  add_text text)
219          | "ul" | "ol" ->
220              need_space := false;
221              Buffer.add_string buffer "\n";
222              let is_ordered = tag = "ol" in
223              let items = Soup.children elem |> Soup.to_list in
224              List.iteri (fun i item ->
225                match Soup.element item with
226                | Some li when Soup.name li = "li" ->
227                    need_space := false;
228                    if is_ordered then
229                      Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
230                    else
231                      Buffer.add_string buffer "- ";
232                    Soup.children li |> Soup.iter process_node;
233                    Buffer.add_string buffer "\n"
234                | _ -> ()
235              ) items;
236              Buffer.add_string buffer "\n";
237              need_space := false
238          | "blockquote" ->
239              need_space := false;
240              Buffer.add_string buffer "\n> ";
241              Soup.children elem |> Soup.iter process_node;
242              Buffer.add_string buffer "\n\n";
243              need_space := false
244          | "img" ->
245              (* Add space before if needed *)
246              if !need_space then begin
247                match last_char () with
248                | Some (' ' | '\n') -> ()
249                | _ -> Buffer.add_char buffer ' '
250              end;
251              let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
252              let src = Soup.attribute "src" elem |> Option.value ~default:"" in
253              Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src);
254              need_space := false;
255              mark_space_needed ()
256          | "hr" ->
257              need_space := false;
258              Buffer.add_string buffer "\n---\n\n";
259              need_space := false
260          (* Strip these tags but keep content *)
261          | "div" | "span" | "article" | "section" | "header" | "footer"
262          | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
263              Soup.children elem |> Soup.iter process_node
264          (* Ignore script, style, etc *)
265          | "script" | "style" | "noscript" -> ()
266          (* Default: just process children *)
267          | _ ->
268              Soup.children elem |> Soup.iter process_node)
269      | None ->
270          (* Text node - handle whitespace properly *)
271          match Soup.leaf_text node with
272          | Some text ->
273              (* If text is only whitespace, mark that we need space *)
274              let trimmed = String.trim text in
275              if trimmed = "" then begin
276                if has_whitespace text then
277                  need_space := true
278              end else begin
279                (* Text has content - check if it had leading/trailing whitespace *)
280                let had_leading_ws = has_whitespace text &&
281                  (String.length text > 0 &&
282                   (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
283
284                (* If had leading whitespace, mark we need space *)
285                if had_leading_ws then need_space := true;
286
287                (* Add the text content *)
288                add_text trimmed;
289
290                (* If had trailing whitespace, mark we need space for next *)
291                let had_trailing_ws = has_whitespace text &&
292                  (String.length text > 0 &&
293                   let last = text.[String.length text - 1] in
294                   last = ' ' || last = '\t' || last = '\n' || last = '\r') in
295                if had_trailing_ws then need_space := true
296              end
297          | None -> ()
298    in
299
300    Soup.children soup |> Soup.iter process_node;
301
302    (* Clean up the result *)
303    let result = Buffer.contents buffer in
304    cleanup_markdown result
305  with _ -> html_str
306
307(** Convert HTML content to clean Markdown *)
308let to_markdown html_str =
309  html_to_markdown html_str